import ceph pacific 16.2.11

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
Thomas Lamprecht 2023-01-27 10:41:18 +01:00
parent ae4ec110fe
commit e8875651b9
1131 changed files with 217343 additions and 128572 deletions

View File

@ -49,3 +49,88 @@ COPYING* @ceph/doc-writers
/doc/ @ceph/doc-writers
README* @ceph/doc-writers
*.rst @ceph/doc-writers
# core
/doc/man/8/ceph-authtool.rst @ceph/core
/doc/man/8/ceph-conf.rst @ceph/core
/doc/man/8/ceph-create-keys.rst @ceph/core
/doc/man/8/ceph-kvstore-tool.rst @ceph/core
/doc/man/8/ceph-mon.rst @ceph/core
/doc/man/8/ceph-objectstore-tool.rst @ceph/core
/doc/man/8/ceph-osd.rst @ceph/core
/doc/man/8/ceph.rst @ceph/core
/doc/man/8/crushtool.rst @ceph/core
/doc/man/8/monmaptool.rst @ceph/core
/doc/man/8/rados.rst @ceph/core
/doc/rados @ceph/core
/qa/standalone @ceph/core
/qa/suites/rados @ceph/core
/qa/workunits/erasure-code @ceph/core
/qa/workunits/mgr @ceph/core
/qa/workunits/mon @ceph/core
/qa/workunits/objectstore @ceph/core
/qa/workunits/rados @ceph/core
/src/ceph.in @ceph/core
/src/ceph_osd.cc @ceph/core
/src/ceph_mon.cc @ceph/core
/src/blk @ceph/core
/src/crush @ceph/core
/src/erasure-code @ceph/core
/src/kv @ceph/core
/src/librados @ceph/core
/src/mgr @ceph/core
/src/mon @ceph/core
/src/msg @ceph/core
/src/os @ceph/core
/src/osd @ceph/core
/src/tools/rados @ceph/core
/src/test/osd @ceph/core
# rbd
/doc/dev/rbd* @ceph/rbd
/doc/man/8/ceph-rbdnamer.rst @ceph/rbd
/doc/man/8/rbd* @ceph/rbd
/doc/rbd @ceph/rbd
/doc/start/quick-rbd.rst @ceph/rbd
/qa/rbd @ceph/rbd
/qa/run_xfstests* @ceph/rbd
/qa/suites/krbd @ceph/rbd
/qa/suites/rbd @ceph/rbd
/qa/tasks/ceph_iscsi_client.py @ceph/rbd
/qa/tasks/metadata.yaml @ceph/rbd
/qa/tasks/qemu.py @ceph/rbd
/qa/tasks/rbd* @ceph/rbd
/qa/tasks/userdata* @ceph/rbd
/qa/workunits/cls/test_cls_journal.sh @ceph/rbd
/qa/workunits/cls/test_cls_lock.sh @ceph/rbd
/qa/workunits/cls/test_cls_rbd.sh @ceph/rbd
/qa/workunits/rbd @ceph/rbd
/src/ceph-rbdnamer @ceph/rbd
/src/cls/journal @ceph/rbd
/src/cls/lock @ceph/rbd
/src/cls/rbd @ceph/rbd
/src/common/options/rbd* @ceph/rbd
/src/etc-rbdmap @ceph/rbd
/src/include/krbd.h @ceph/rbd
/src/include/rbd* @ceph/rbd
/src/journal @ceph/rbd
/src/krbd.cc @ceph/rbd
/src/librbd @ceph/rbd
/src/ocf @ceph/rbd
/src/pybind/mgr/rbd_support @ceph/rbd
/src/pybind/rbd @ceph/rbd
/src/rbd* @ceph/rbd
/src/test/cli/rbd @ceph/rbd
/src/test/cli-integration/rbd @ceph/rbd
/src/test/cls_journal @ceph/rbd
/src/test/cls_lock @ceph/rbd
/src/test/cls_rbd @ceph/rbd
/src/test/journal @ceph/rbd
/src/test/librbd @ceph/rbd
/src/test/pybind/test_rbd.py @ceph/rbd
/src/test/rbd* @ceph/rbd
/src/test/run-rbd* @ceph/rbd
/src/test/test_rbd* @ceph/rbd
/src/tools/rbd* @ceph/rbd
/systemd/rbdmap.service.in @ceph/rbd
/udev/50-rbd.rules @ceph/rbd

View File

@ -12,13 +12,13 @@ jobs:
with:
sync-labels: ''
repo-token: "${{ secrets.GITHUB_TOKEN }}"
- name: Assign to Dashboard project
uses: srggrs/assign-one-project-github-action@65a8ddab497df42ef268001e67bbf976f8fd39e1
if: contains(github.event.pull_request.labels.*.name, 'dashboard')
with:
project: https://github.com/ceph/ceph/projects/6
- name: Assign milestone based on target branch name
uses: iyu/actions-milestone@dbf7e5348844c9ddc6b803a5721b85fa70fe3bb9
with:
configuration-path: .github/milestone.yml
repo-token: "${{ secrets.GITHUB_TOKEN }}"
- name: Assign to Dashboard project
uses: srggrs/assign-one-project-github-action@65a8ddab497df42ef268001e67bbf976f8fd39e1
if: contains(github.event.pull_request.labels.*.name, 'dashboard')
with:
project: https://github.com/ceph/ceph/projects/6

View File

@ -5,9 +5,13 @@
version: 2
formats: []
build:
image: latest
os: ubuntu-22.04
tools:
python: "3.8"
apt_packages:
- ditaa
- graphviz
python:
version: 3
install:
- requirements: admin/doc-requirements.txt
- requirements: admin/doc-read-the-docs.txt

View File

@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
# remove cmake/modules/FindPython* once 3.12 is required
project(ceph
VERSION 16.2.10
VERSION 16.2.11
LANGUAGES CXX C ASM)
foreach(policy
@ -36,7 +36,15 @@ if(WIN32)
# the targeted Windows version. The availability of certain functions and
# structures will depend on it.
set(WIN32_WINNT "0x0A00" CACHE STRING "Targeted Windows version.")
add_definitions(-D_WIN32_WINNT=${WIN32_WINNT})
# In order to avoid known winpthread issues, we're using the boost
# shared mutex implementation.
# https://github.com/msys2/MINGW-packages/issues/3319
add_definitions(
-D_WIN32_WINNT=${WIN32_WINNT}
-DBOOST_THREAD_PROVIDES_GENERIC_SHARED_MUTEX_ON_WIN
-DBOOST_THREAD_V2_SHARED_MUTEX
)
set(Boost_THREADAPI "win32")
endif()
if(MINGW)

View File

@ -32,9 +32,33 @@
in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
the restored file system is expected to have the same ID as before.
>=16.2.11
--------
* Cephfs: The 'AT_NO_ATTR_SYNC' macro is deprecated; please use the standard
'AT_STATX_DONT_SYNC' macro instead. The 'AT_NO_ATTR_SYNC' macro will be removed
in the future.
* Trimming of PGLog dups is now controlled by the size instead of the version.
This fixes the PGLog inflation issue that was happening when the on-line
(in OSD) trimming got jammed after a PG split operation. Also, a new off-line
mechanism has been added: `ceph-objectstore-tool` gained a `trim-pg-log-dups` op
that targets situations where an OSD is unable to boot because of those inflated
dups. In that case, the "You can be hit by THE DUPS BUG" warning will be visible
in the OSD logs.
Relevant tracker: https://tracker.ceph.com/issues/53729
* RBD: The `rbd device unmap` command gained a `--namespace` option. Support for
namespaces was added to RBD in Nautilus 14.2.0, and it has been possible to map
and unmap images in namespaces using the `image-spec` syntax since then, but the
corresponding option available in most other commands was missing.
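For illustration (pool, namespace, and image names here are placeholders), an
image in a namespace can now be unmapped either via the `image-spec` syntax or
with the new option:

    rbd device unmap mypool/mynamespace/myimage
    rbd device unmap --pool mypool --namespace mynamespace myimage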
>=16.2.8
--------
* RGW: The behavior for Multipart Upload was modified so that only
CompleteMultipartUpload notification is sent at the end of the multipart upload.
The POST notification at the beginning of the upload, and PUT notifications that
were sent on each part are not sent anymore.
* MON/MGR: Pools can now be created with `--bulk` flag. Any pools created with `bulk`
will use a profile of the `pg_autoscaler` that provides more performance from the start.
However, any pools created without the `--bulk` flag will remain using its old behavior
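For illustration (the pool name is a placeholder), a pool that should use the
`bulk` profile can be created with:

    ceph osd pool create mypool --bulk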

View File

@ -29,7 +29,11 @@
%else
%bcond_without tcmalloc
%endif
%if 0%{?rhel} >= 9
%bcond_without system_pmdk
%else
%bcond_with system_pmdk
%endif
%if 0%{?fedora} || 0%{?rhel}
%bcond_without selinux
%ifarch x86_64 ppc64le
@ -120,11 +124,18 @@
# disable dwz which compresses the debuginfo
%global _find_debuginfo_dwz_opts %{nil}
%if 0%{with seastar}
# disable -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1, as gcc-toolset-{9,10}-annobin
# do not provide gcc-annobin.so anymore, despite that they provide annobin.so. but
# redhat-rpm-config still passes -fplugin=gcc-annobin to the compiler.
%undefine _annotated_build
%endif
#################################################################################
# main package definition
#################################################################################
Name: ceph
Version: 16.2.10
Version: 16.2.11
Release: 0%{?dist}
%if 0%{?fedora} || 0%{?rhel}
Epoch: 2
@ -140,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
Group: System/Filesystems
%endif
URL: http://ceph.com/
Source0: %{?_remote_tarball_prefix}ceph-16.2.10.tar.bz2
Source0: %{?_remote_tarball_prefix}ceph-16.2.11.tar.bz2
%if 0%{?suse_version}
# _insert_obs_source_lines_here
ExclusiveArch: x86_64 aarch64 ppc64le s390x
@ -229,7 +240,6 @@ BuildRequires: %{luarocks_package_name}
BuildRequires: jq
BuildRequires: libuuid-devel
BuildRequires: python%{python3_pkgversion}-bcrypt
BuildRequires: python%{python3_pkgversion}-nose
BuildRequires: python%{python3_pkgversion}-pecan
BuildRequires: python%{python3_pkgversion}-requests
BuildRequires: python%{python3_pkgversion}-dateutil
@ -304,6 +314,7 @@ BuildRequires: rdma-core-devel
BuildRequires: liblz4-devel >= 1.7
# for prometheus-alerts
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: jsonnet
%endif
%if 0%{?fedora} || 0%{?rhel}
Requires: systemd
@ -345,6 +356,7 @@ BuildRequires: python%{python3_pkgversion}-pyOpenSSL
%endif
%if 0%{?suse_version}
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: jsonnet
BuildRequires: libxmlsec1-1
BuildRequires: libxmlsec1-nss1
BuildRequires: libxmlsec1-openssl1
@ -548,6 +560,7 @@ Group: System/Filesystems
Requires: ceph-mgr = %{_epoch_prefix}%{version}-%{release}
Requires: ceph-grafana-dashboards = %{_epoch_prefix}%{version}-%{release}
Requires: ceph-prometheus-alerts = %{_epoch_prefix}%{version}-%{release}
Requires: python%{python3_pkgversion}-setuptools
%if 0%{?fedora} || 0%{?rhel}
Requires: python%{python3_pkgversion}-cherrypy
Requires: python%{python3_pkgversion}-jwt
@ -597,6 +610,7 @@ Requires: python%{python3_pkgversion}-pecan
Requires: python%{python3_pkgversion}-pyOpenSSL
Requires: python%{python3_pkgversion}-requests
Requires: python%{python3_pkgversion}-dateutil
Requires: python%{python3_pkgversion}-setuptools
%if 0%{?fedora} || 0%{?rhel} >= 8
Requires: python%{python3_pkgversion}-cherrypy
Requires: python%{python3_pkgversion}-pyyaml
@ -1194,12 +1208,14 @@ This package provides Ceph default alerts for Prometheus.
# common
#################################################################################
%prep
%autosetup -p1 -n ceph-16.2.10
%autosetup -p1 -n ceph-16.2.11
%build
# LTO can be enabled as soon as the following GCC bug is fixed:
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48200
# Disable lto on systems that do not support symver attribute
# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48200 for details
%if ( 0%{?rhel} && 0%{?rhel} < 9 ) || ( 0%{?suse_version} && 0%{?suse_version} <= 1500 )
%define _lto_cflags %{nil}
%endif
%if 0%{with seastar} && 0%{?rhel}
. /opt/rh/gcc-toolset-9/enable
@ -1433,6 +1449,9 @@ install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yml %{buildroot}/etc/p
%clean
rm -rf %{buildroot}
# built binaries are no longer necessary at this point,
# but are consuming ~17GB of disk in the build environment
rm -rf build
#################################################################################
# files and systemd scriptlets
@ -1528,8 +1547,7 @@ exit 0
%if ! 0%{?suse_version}
%postun -n cephadm
userdel -r cephadm || true
exit 0
[ $1 -ne 0 ] || userdel cephadm || :
%endif
%files -n cephadm
@ -1566,6 +1584,8 @@ exit 0
%{_bindir}/rbd-replay-prep
%endif
%{_bindir}/ceph-post-file
%dir %{_libdir}/ceph/denc
%{_libdir}/ceph/denc/denc-mod-*.so
%{_tmpfilesdir}/ceph-common.conf
%{_mandir}/man8/ceph-authtool.8*
%{_mandir}/man8/ceph-conf.8*

View File

@ -29,7 +29,11 @@
%else
%bcond_without tcmalloc
%endif
%if 0%{?rhel} >= 9
%bcond_without system_pmdk
%else
%bcond_with system_pmdk
%endif
%if 0%{?fedora} || 0%{?rhel}
%bcond_without selinux
%ifarch x86_64 ppc64le
@ -120,6 +124,13 @@
# disable dwz which compresses the debuginfo
%global _find_debuginfo_dwz_opts %{nil}
%if 0%{with seastar}
# disable -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1, as gcc-toolset-{9,10}-annobin
# do not provide gcc-annobin.so anymore, despite that they provide annobin.so. but
# redhat-rpm-config still passes -fplugin=gcc-annobin to the compiler.
%undefine _annotated_build
%endif
#################################################################################
# main package definition
#################################################################################
@ -229,7 +240,6 @@ BuildRequires: %{luarocks_package_name}
BuildRequires: jq
BuildRequires: libuuid-devel
BuildRequires: python%{python3_pkgversion}-bcrypt
BuildRequires: python%{python3_pkgversion}-nose
BuildRequires: python%{python3_pkgversion}-pecan
BuildRequires: python%{python3_pkgversion}-requests
BuildRequires: python%{python3_pkgversion}-dateutil
@ -304,6 +314,7 @@ BuildRequires: rdma-core-devel
BuildRequires: liblz4-devel >= 1.7
# for prometheus-alerts
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: jsonnet
%endif
%if 0%{?fedora} || 0%{?rhel}
Requires: systemd
@ -345,6 +356,7 @@ BuildRequires: python%{python3_pkgversion}-pyOpenSSL
%endif
%if 0%{?suse_version}
BuildRequires: golang-github-prometheus-prometheus
BuildRequires: jsonnet
BuildRequires: libxmlsec1-1
BuildRequires: libxmlsec1-nss1
BuildRequires: libxmlsec1-openssl1
@ -548,6 +560,7 @@ Group: System/Filesystems
Requires: ceph-mgr = %{_epoch_prefix}%{version}-%{release}
Requires: ceph-grafana-dashboards = %{_epoch_prefix}%{version}-%{release}
Requires: ceph-prometheus-alerts = %{_epoch_prefix}%{version}-%{release}
Requires: python%{python3_pkgversion}-setuptools
%if 0%{?fedora} || 0%{?rhel}
Requires: python%{python3_pkgversion}-cherrypy
Requires: python%{python3_pkgversion}-jwt
@ -597,6 +610,7 @@ Requires: python%{python3_pkgversion}-pecan
Requires: python%{python3_pkgversion}-pyOpenSSL
Requires: python%{python3_pkgversion}-requests
Requires: python%{python3_pkgversion}-dateutil
Requires: python%{python3_pkgversion}-setuptools
%if 0%{?fedora} || 0%{?rhel} >= 8
Requires: python%{python3_pkgversion}-cherrypy
Requires: python%{python3_pkgversion}-pyyaml
@ -1197,9 +1211,11 @@ This package provides Ceph default alerts for Prometheus.
%autosetup -p1 -n @TARBALL_BASENAME@
%build
# LTO can be enabled as soon as the following GCC bug is fixed:
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48200
# Disable lto on systems that do not support symver attribute
# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48200 for details
%if ( 0%{?rhel} && 0%{?rhel} < 9 ) || ( 0%{?suse_version} && 0%{?suse_version} <= 1500 )
%define _lto_cflags %{nil}
%endif
%if 0%{with seastar} && 0%{?rhel}
. /opt/rh/gcc-toolset-9/enable
@ -1433,6 +1449,9 @@ install -m 644 -D monitoring/ceph-mixin/prometheus_alerts.yml %{buildroot}/etc/p
%clean
rm -rf %{buildroot}
# built binaries are no longer necessary at this point,
# but are consuming ~17GB of disk in the build environment
rm -rf build
#################################################################################
# files and systemd scriptlets
@ -1528,8 +1547,7 @@ exit 0
%if ! 0%{?suse_version}
%postun -n cephadm
userdel -r cephadm || true
exit 0
[ $1 -ne 0 ] || userdel cephadm || :
%endif
%files -n cephadm
@ -1566,6 +1584,8 @@ exit 0
%{_bindir}/rbd-replay-prep
%endif
%{_bindir}/ceph-post-file
%dir %{_libdir}/ceph/denc
%{_libdir}/ceph/denc/denc-mod-*.so
%{_tmpfilesdir}/ceph-common.conf
%{_mandir}/man8/ceph-authtool.8*
%{_mandir}/man8/ceph-conf.8*

View File

@ -1,7 +1,13 @@
ceph (16.2.10-1focal) focal; urgency=medium
ceph (16.2.11-1focal) focal; urgency=medium
-- Jenkins Build Slave User <jenkins-build@adami06.front.sepia.ceph.com> Thu, 21 Jul 2022 17:38:01 +0000
-- Jenkins Build Slave User <jenkins-build@braggi16.front.sepia.ceph.com> Tue, 24 Jan 2023 21:28:06 +0000
ceph (16.2.11-1) stable; urgency=medium
* New upstream release
-- Ceph Release Team <ceph-maintainers@ceph.io> Tue, 24 Jan 2023 20:43:11 +0000
ceph (16.2.10-1) stable; urgency=medium

View File

@ -11,6 +11,9 @@
# Boost_USE_MULTITHREADED : boolean (default: OFF)
# BOOST_J: integer (default 1)
# CMAKE_CURRENT_FUNCTION_LIST_DIR is introduced by cmake 3.17, but ubuntu comes with 3.16
set(_build_boost_list_dir "${CMAKE_CURRENT_LIST_DIR}")
function(check_boost_version source_dir expected_version)
set(version_hpp "${source_dir}/boost/version.hpp")
if(NOT EXISTS ${version_hpp})
@ -70,7 +73,7 @@ function(do_build_boost version)
if(c MATCHES "^python([0-9])\$")
set(with_python_version "${CMAKE_MATCH_1}")
list(APPEND boost_with_libs "python")
elseif(c MATCHES "^python([0-9])\\.?([0-9])\$")
elseif(c MATCHES "^python([0-9])\\.?([0-9]+)\$")
set(with_python_version "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}")
list(APPEND boost_with_libs "python")
else()
@ -167,10 +170,12 @@ function(do_build_boost version)
URL_HASH SHA256=${boost_sha256}
DOWNLOAD_NO_PROGRESS 1)
endif()
find_program(PATCH_EXECUTABLE patch)
# build all components in a single shot
include(ExternalProject)
ExternalProject_Add(Boost
${source_dir}
PATCH_COMMAND ${PATCH_EXECUTABLE} -p3 -i ${_build_boost_list_dir}/boost-python-use-public-api-for-filename.patch
CONFIGURE_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${configure_command}
BUILD_COMMAND CC=${CMAKE_C_COMPILER} CXX=${CMAKE_CXX_COMPILER} ${build_command}
BUILD_IN_SOURCE 1

View File

@ -9,14 +9,15 @@ function(build_fio)
include(FindMake)
find_make("MAKE_EXECUTABLE" "make_cmd")
set(source_dir ${CMAKE_BINARY_DIR}/src/fio)
file(MAKE_DIRECTORY ${source_dir})
ExternalProject_Add(fio_ext
DOWNLOAD_DIR ${CMAKE_BINARY_DIR}/src/
UPDATE_COMMAND "" # this disables rebuild on each run
GIT_REPOSITORY "https://github.com/axboe/fio.git"
GIT_REPOSITORY "https://github.com/ceph/fio.git"
GIT_CONFIG advice.detachedHead=false
GIT_SHALLOW 1
GIT_TAG "fio-3.15"
SOURCE_DIR ${CMAKE_BINARY_DIR}/src/fio
GIT_TAG "fio-3.27-cxx"
SOURCE_DIR ${source_dir}
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND <SOURCE_DIR>/configure
BUILD_COMMAND ${make_cmd} fio EXTFLAGS=-Wno-format-truncation ${FIO_EXTLIBS}
@ -25,5 +26,6 @@ function(build_fio)
add_library(fio INTERFACE IMPORTED)
add_dependencies(fio fio_ext)
set_target_properties(fio PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES ${CMAKE_BINARY_DIR}/src/fio)
INTERFACE_INCLUDE_DIRECTORIES ${source_dir}
INTERFACE_COMPILE_OPTIONS "-include;${source_dir}/config-host.h;$<$<COMPILE_LANGUAGE:C>:-std=gnu99>$<$<COMPILE_LANGUAGE:CXX>:-std=gnu++17>")
endfunction()

View File

@ -21,6 +21,7 @@ function(build_pmem)
set(PMDK_LIB_DIR "nondebug")
endif()
set(pmdk_cflags "-Wno-error -fno-lto")
include(ExternalProject)
ExternalProject_Add(pmdk_ext
${source_dir_args}
@ -29,7 +30,7 @@ function(build_pmem)
# build system tests statically linking to librbd (which uses
# libpmemobj) will not link (because we don't build the ndctl
# static library here).
BUILD_COMMAND ${make_cmd} CC=${CMAKE_C_COMPILER} NDCTL_ENABLE=n BUILD_EXAMPLES=n BUILD_BENCHMARKS=n DOC=n
BUILD_COMMAND ${make_cmd} CC=${CMAKE_C_COMPILER} "EXTRA_CFLAGS=${pmdk_cflags}" NDCTL_ENABLE=n BUILD_EXAMPLES=n BUILD_BENCHMARKS=n DOC=n
BUILD_IN_SOURCE 1
BUILD_BYPRODUCTS "<SOURCE_DIR>/src/${PMDK_LIB_DIR}/libpmem.a" "<SOURCE_DIR>/src/${PMDK_LIB_DIR}/libpmemobj.a"
INSTALL_COMMAND "")

View File

@ -144,6 +144,34 @@ else(NOT CMAKE_CROSSCOMPILING)
message(STATUS "Assuming unaligned access is supported")
endif(NOT CMAKE_CROSSCOMPILING)
set(version_script_source "v1 { }; v2 { } v1;")
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/version_script.txt "${version_script_source}")
cmake_push_check_state(RESET)
set(CMAKE_REQUIRED_FLAGS "-Werror -Wl,--version-script=${CMAKE_CURRENT_BINARY_DIR}/version_script.txt")
check_c_source_compiles("
__attribute__((__symver__ (\"func@v1\"))) void func_v1() {};
__attribute__((__symver__ (\"func@v2\"))) void func_v2() {};
int main() {}"
HAVE_ATTR_SYMVER)
if(NOT HAVE_ATTR_SYMVER)
if(CMAKE_CXX_FLAGS MATCHES "-flto" AND NOT CMAKE_CXX_FLAGS MATCHES "-flto-partition=none")
# https://tracker.ceph.com/issues/40060
message(FATAL_ERROR "please pass -flto-partition=none as part of CXXFLAGS")
endif()
endif()
set(CMAKE_REQUIRED_FLAGS -Wl,--version-script=${CMAKE_CURRENT_BINARY_DIR}/version_script.txt)
check_c_source_compiles("
void func_v1() {}
__asm__(\".symver func_v1, func@v1\");
void func_v2() {}
__asm__(\".symver func_v2, func@v2\");
int main() {}"
HAVE_ASM_SYMVER)
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/version_script.txt)
cmake_pop_check_state()
# should use LINK_OPTIONS instead of LINK_LIBRARIES, if we can use cmake v3.14+
try_compile(HAVE_LINK_VERSION_SCRIPT
${CMAKE_CURRENT_BINARY_DIR}

View File

@ -65,14 +65,13 @@ function(distutils_add_cython_module target name src)
# This little bit of magic wipes out __Pyx_check_single_interpreter()
# Note: this is reproduced in distutils_install_cython_module
list(APPEND cflags -D'void0=dead_function\(void\)')
list(APPEND cflags -D'__Pyx_check_single_interpreter\(ARG\)=ARG \#\# 0')
list(APPEND cflags -D'__Pyx_check_single_interpreter\(ARG\)=ARG\#\#0')
set(PY_CC ${compiler_launcher} ${CMAKE_C_COMPILER} ${c_compiler_arg1} ${cflags})
set(PY_CXX ${compiler_launcher} ${CMAKE_CXX_COMPILER} ${cxx_compiler_arg1})
set(PY_LDSHARED ${link_launcher} ${CMAKE_C_COMPILER} ${c_compiler_arg1} "-shared")
set(suffix_var "EXT_SUFFIX")
execute_process(COMMAND "${Python3_EXECUTABLE}" -c
"from distutils import sysconfig; print(sysconfig.get_config_var('${suffix_var}'))"
"import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX'))"
RESULT_VARIABLE result
OUTPUT_VARIABLE ext_suffix
ERROR_VARIABLE error
@ -113,7 +112,7 @@ function(distutils_install_cython_module name)
set(ENV{LDSHARED} \"${PY_LDSHARED}\")
set(ENV{CPPFLAGS} \"-iquote${CMAKE_SOURCE_DIR}/src/include
-D'void0=dead_function\(void\)' \
-D'__Pyx_check_single_interpreter\(ARG\)=ARG \#\# 0'\")
-D'__Pyx_check_single_interpreter\(ARG\)=ARG\#\#0'\")
set(ENV{LDFLAGS} \"-L${CMAKE_LIBRARY_OUTPUT_DIRECTORY}\")
set(ENV{CYTHON_BUILD_DIR} \"${CMAKE_CURRENT_BINARY_DIR}\")
set(ENV{CEPH_LIBDIR} \"${CMAKE_LIBRARY_OUTPUT_DIRECTORY}\")

View File

@ -0,0 +1,38 @@
From d9f06052e28873037db7f98629bce72182a42410 Mon Sep 17 00:00:00 2001
From: Pat Riehecky <riehecky@fnal.gov>
Date: Mon, 29 Jun 2020 10:51:58 -0500
Subject: [PATCH] Convert Python 3.1+ to use public C API for filenames
---
src/exec.cpp | 16 ++++++++++++----
1 file changed, 12 insertions(+), 4 deletions(-)
diff --git a/src/exec.cpp b/src/exec.cpp
index 171c6f4189..b2eabe59f6 100644
--- a/src/boost/libs/python/src/exec.cpp
+++ b/src/boost/libs/python/src/exec.cpp
@@ -104,14 +104,22 @@ object BOOST_PYTHON_DECL exec_file(char const *filename, object global, object l
if (local.is_none()) local = global;
// should be 'char const *' but older python versions don't use 'const' yet.
char *f = const_cast<char *>(filename);
- // Let python open the file to avoid potential binary incompatibilities.
-#if PY_VERSION_HEX >= 0x03040000
- FILE *fs = _Py_fopen(f, "r");
+#if PY_VERSION_HEX >= 0x03010000
+ // Let python manage any UTF bits to avoid potential incompatibilities.
+ PyObject *fo = Py_BuildValue("s", f);
+ PyObject *fb = Py_None;
+ PyUnicode_FSConverter(fo, &fb);
+ f = PyBytes_AsString(fb);
+ FILE *fs = fopen(f, "r");
+ Py_DECREF(fo);
+ Py_DECREF(fb);
#elif PY_VERSION_HEX >= 0x03000000
+ // Let python open the file to avoid potential binary incompatibilities.
PyObject *fo = Py_BuildValue("s", f);
- FILE *fs = _Py_fopen(fo, "r");
+ FILE *fs = _Py_fopen(fo, "r"); // Private CPython API
Py_DECREF(fo);
#else
+ // Let python open the file to avoid potential binary incompatibilities.
PyObject *pyfile = PyFile_FromString(f, const_cast<char*>("r"));
if (!pyfile) throw std::invalid_argument(std::string(f) + " : no such file");
python::handle<> file(pyfile);

View File

@ -23,6 +23,7 @@ usr/bin/rbd-replay*
usr/bin/ceph-post-file
usr/sbin/mount.ceph sbin
usr/lib/ceph/compressor/*
usr/lib/ceph/denc/*
usr/lib/ceph/crypto/* [amd64]
usr/share/man/man8/ceph-authtool.8
usr/share/man/man8/ceph-conf.8

View File

@ -24,6 +24,7 @@ Build-Depends: automake,
g++ (>= 7),
javahelper,
jq <pkg.ceph.check>,
jsonnet <pkg.ceph.check>,
junit4,
libaio-dev,
libbabeltrace-ctf-dev,
@ -37,7 +38,7 @@ Build-Depends: automake,
libcurl4-openssl-dev,
# Jaeger libevent-dev,
libexpat1-dev,
# Make-Check libffi-dev [!amd64],
libffi-dev [!amd64] <pkg.ceph.check>,
libfuse-dev,
libgoogle-perftools-dev [i386 amd64 arm64],
# Crimson libgnutls28-dev,
@ -68,44 +69,44 @@ Build-Depends: automake,
librabbitmq-dev,
librdkafka-dev,
luarocks,
# Make-Check libxmlsec1,
# Make-Check libxmlsec1-nss,
# Make-Check libxmlsec1-openssl,
# Make-Check libxmlsec1-dev,
libxmlsec1 <pkg.ceph.check>,
libxmlsec1-nss <pkg.ceph.check>,
libxmlsec1-openssl <pkg.ceph.check>,
libxmlsec1-dev <pkg.ceph.check>,
# Crimson libyaml-cpp-dev,
# Jaeger nlohmann-json-dev | nlohmann-json3-dev,
parted,
patch,
pkg-config,
# Make-Check prometheus,
prometheus <pkg.ceph.check>,
# Crimson protobuf-compiler,
python3-all-dev,
python3-cherrypy3,
# Make-Check python3-jwt,
# Make-Check python3-nose,
# Make-Check python3-pecan,
# Make-Check python3-bcrypt,
# Make-Check tox,
# Make-Check python3-coverage,
# Make-Check python3-dateutil,
# Make-Check python3-openssl,
# Make-Check python3-prettytable,
# Make-Check python3-requests,
# Make-Check python3-scipy,
python3-jwt <pkg.ceph.check>,
python3-pecan <pkg.ceph.check>,
python3-bcrypt <pkg.ceph.check>,
tox <pkg.ceph.check>,
python3-coverage <pkg.ceph.check>,
python3-dateutil <pkg.ceph.check>,
python3-pkg-resources <pkg.ceph.check>,
python3-openssl <pkg.ceph.check>,
python3-prettytable <pkg.ceph.check>,
python3-requests <pkg.ceph.check>,
python3-scipy <pkg.ceph.check>,
python3-setuptools,
python3-sphinx,
# Make-Check python3-werkzeug,
python3-werkzeug <pkg.ceph.check>,
python3-setuptools,
python3-venv,
# Crimson ragel,
# Make-Check socat,
socat <pkg.ceph.check>,
# Crimson systemtap-sdt-dev,
# Make-Check uuid-dev,
uuid-dev <pkg.ceph.check>,
uuid-runtime,
valgrind,
xfslibs-dev,
# Make-Check xfsprogs,
# Make-Check xmlstarlet,
xfsprogs <pkg.ceph.check>,
xmlstarlet <pkg.ceph.check>,
nasm [amd64],
zlib1g-dev,
# Jaeger Built-Using: libyaml-cpp-dev (>= 0.6),
@ -234,6 +235,8 @@ Depends: ceph-base (= ${binary:Version}),
python3-pecan,
python3-requests,
python3-werkzeug,
libsqlite3-mod-ceph (= ${binary:Version}),
librados2 (= ${binary:Version}),
${misc:Depends},
${python:Depends},
${shlibs:Depends},
@ -258,6 +261,7 @@ Depends: ceph-mgr (= ${binary:Version}),
python3-cherrypy3,
python3-jwt,
python3-bcrypt,
python3-pkg-resources,
python3-werkzeug,
python3-routes,
${misc:Depends},
@ -735,7 +739,8 @@ Description: RADOS distributed object store client C++ library (development file
Package: libsqlite3-mod-ceph
Architecture: any
Section: libs
Depends: ${misc:Depends},
Depends: librados2 (= ${binary:Version}),
${misc:Depends},
${shlibs:Depends},
Description: SQLite3 VFS for Ceph
A SQLite3 VFS for storing and manipulating databases stored on Ceph's RADOS
@ -1249,3 +1254,4 @@ Description: prometheus alerts for the ceph dashboard
.
This package contains alerts used for prometheus to interact with the
Ceph Dashboard.

ceph/debian/python3-ceph-argparse.install Normal file → Executable file
View File

@ -1,2 +1,4 @@
usr/lib/python3*/dist-packages/ceph_argparse.py
usr/lib/python3*/dist-packages/ceph_daemon.py
#! /usr/bin/dh-exec
usr/lib/python3*/*-packages/ceph_argparse.py /usr/lib/python3/dist-packages/
usr/lib/python3*/*-packages/ceph_daemon.py /usr/lib/python3/dist-packages/

ceph/debian/python3-cephfs.install Normal file → Executable file
View File

@ -1,3 +1,5 @@
usr/lib/python3*/dist-packages/ceph_volume_client.py
#! /usr/bin/dh-exec
usr/lib/python3*/*-packages/ceph_volume_client.py /usr/lib/python3/dist-packages/
usr/lib/python3*/dist-packages/cephfs-*.egg-info
usr/lib/python3*/dist-packages/cephfs.cpython*.so

View File

@ -36,6 +36,13 @@ if [ -r /etc/os-release ]; then
ARGS+=" -DWITH_RADOSGW_AMQP_ENDPOINT=OFF"
ARGS+=" -DWITH_RADOSGW_KAFKA_ENDPOINT=OFF"
;;
ubuntu)
MAJOR_VER=$(echo "$VERSION_ID" | sed -e 's/\..*$//')
if [ "$MAJOR_VER" -ge "22" ] ; then
PYBUILD="3.10"
fi
;;
esac
elif [ "$(uname)" == FreeBSD ] ; then
PYBUILD="3"

View File

@ -1,3 +1,23 @@
dt {
scroll-margin-top: 3em;
}
h2 {
scroll-margin-top: 4em;
}
h3 {
scroll-margin-top: 4em;
}
section {
scroll-margin-top: 4em;
}
span {
scroll-margin-top: 2em;
}
ul.simple > li > ul > li:last-child {
margin-block-end : 1em;
}

View File

@ -13,6 +13,7 @@ replicate and redistribute data dynamically.
.. image:: images/stack.png
.. _arch-ceph-storage-cluster:
The Ceph Storage Cluster
========================
@ -59,7 +60,7 @@ service interfaces built on top of ``librados``.
Storing Data
------------
The Ceph Storage Cluster receives data from :term:`Ceph Clients`--whether it
The Ceph Storage Cluster receives data from :term:`Ceph Client`\s--whether it
comes through a :term:`Ceph Block Device`, :term:`Ceph Object Storage`, the
:term:`Ceph File System` or a custom implementation you create using
``librados``-- which is stored as RADOS objects. Each object is stored on an
@ -80,7 +81,7 @@ stored in a monolithic database-like fashion.
Ceph OSD Daemons store data as objects in a flat namespace (e.g., no
hierarchy of directories). An object has an identifier, binary data, and
metadata consisting of a set of name/value pairs. The semantics are completely
up to :term:`Ceph Clients`. For example, CephFS uses metadata to store file
up to :term:`Ceph Client`\s. For example, CephFS uses metadata to store file
attributes such as the file owner, created date, last modified date, and so
forth.
@ -135,6 +136,8 @@ Placement of Replicated Data`_.
.. index:: architecture; cluster map
.. _architecture_cluster_map:
Cluster Map
~~~~~~~~~~~
@ -581,7 +584,7 @@ objects.
Peering and Sets
~~~~~~~~~~~~~~~~
In previous sections, we noted that Ceph OSD Daemons check each others
In previous sections, we noted that Ceph OSD Daemons check each other's
heartbeats and report back to the Ceph Monitor. Another thing Ceph OSD daemons
do is called 'peering', which is the process of bringing all of the OSDs that
store a Placement Group (PG) into agreement about the state of all of the
@ -1619,13 +1622,13 @@ instance for high availability.
.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: https://ceph.com/wp-content/uploads/2016/08/weil-rados-pdsw07.pdf
.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: https://ceph.io/assets/pdfs/weil-rados-pdsw07.pdf
.. _Paxos: https://en.wikipedia.org/wiki/Paxos_(computer_science)
.. _Monitor Config Reference: ../rados/configuration/mon-config-ref
.. _Monitoring OSDs and PGs: ../rados/operations/monitoring-osd-pg
.. _Heartbeats: ../rados/configuration/mon-osd-interaction
.. _Monitoring OSDs: ../rados/operations/monitoring-osd-pg/#monitoring-osds
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.com/wp-content/uploads/2016/08/weil-crush-sc06.pdf
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.io/assets/pdfs/weil-crush-sc06.pdf
.. _Data Scrubbing: ../rados/configuration/osd-config-ref#scrubbing
.. _Report Peering Failure: ../rados/configuration/mon-osd-interaction#osds-report-peering-failure
.. _Troubleshooting Peering Failure: ../rados/troubleshooting/troubleshooting-pg#placement-group-down-peering-failure

View File

@ -2,25 +2,22 @@
``prepare``
===========
This subcommand allows a :term:`filestore` or :term:`bluestore` setup. It is
recommended to pre-provision a logical volume before using it with
``ceph-volume lvm``.
Before you run ``ceph-volume lvm prepare``, we recommend that you provision a
logical volume. Then you can run ``prepare`` on that logical volume.
Logical volumes are not altered except for adding extra metadata.
``prepare`` adds metadata to logical volumes but does not alter them in any
other way.
.. note:: This is part of a two step process to deploy an OSD. If looking for
a single-call way, please see :ref:`ceph-volume-lvm-create`
.. note:: This is part of a two-step process to deploy an OSD. If you prefer
to deploy an OSD by using only one command, see :ref:`ceph-volume-lvm-create`.
To help identify volumes, the process of preparing a volume (or volumes) to
work with Ceph, the tool will assign a few pieces of metadata information using
:term:`LVM tags`.
:term:`LVM tags` makes volumes easy to discover later, and help identify them as
part of a Ceph system, and what role they have (journal, filestore, bluestore,
etc...)
Although :term:`bluestore` is the default, the back end can be specified with:
``prepare`` uses :term:`LVM tags` to assign several pieces of metadata to a
logical volume. Volumes tagged in this way are easier to identify and easier to
use with Ceph. :term:`LVM tags` identify logical volumes by the role that they
play in the Ceph cluster (for example: BlueStore data or BlueStore WAL+DB).
:term:`BlueStore<bluestore>` is the default backend. Ceph permits changing
the backend, which can be done by using the following flags and arguments:
* :ref:`--filestore <ceph-volume-lvm-prepare_filestore>`
* :ref:`--bluestore <ceph-volume-lvm-prepare_bluestore>`
@ -29,50 +26,58 @@ Although :term:`bluestore` is the default, the back end can be specified with:
``bluestore``
-------------
The :term:`bluestore` objectstore is the default for new OSDs. It offers a bit
more flexibility for devices compared to :term:`filestore`.
Bluestore supports the following configurations:
:term:`Bluestore<bluestore>` is the default backend for new OSDs. It
offers more flexibility for devices than :term:`filestore` does. Bluestore
supports the following configurations:
* A block device, a block.wal, and a block.db device
* A block device and a block.wal device
* A block device and a block.db device
* A single block device
* a block device, a block.wal device, and a block.db device
* a block device and a block.wal device
* a block device and a block.db device
* a single block device
The bluestore subcommand accepts physical block devices, partitions on
physical block devices or logical volumes as arguments for the various device parameters
If a physical device is provided, a logical volume will be created. A volume group will
either be created or reused it its name begins with ``ceph``.
This allows a simpler approach at using LVM but at the cost of flexibility:
there are no options or configurations to change how the LV is created.
The ``bluestore`` subcommand accepts physical block devices, partitions on physical
block devices, or logical volumes as arguments for the various device
parameters. If a physical block device is provided, a logical volume will be
created. If the provided volume group's name begins with `ceph`, it will be
created if it does not yet exist and it will be clobbered and reused if it
already exists. This allows for a simpler approach to using LVM but at the
cost of flexibility: no option or configuration can be used to change how the
logical volume is created.
The ``block`` is specified with the ``--data`` flag, and in its simplest use
case it looks like::
case it looks like:
.. prompt:: bash #
ceph-volume lvm prepare --bluestore --data vg/lv
A raw device can be specified in the same way::
A raw device can be specified in the same way:
.. prompt:: bash #
ceph-volume lvm prepare --bluestore --data /path/to/device
For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required::
For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required:
.. prompt:: bash #
ceph-volume lvm prepare --bluestore --dmcrypt --data vg/lv
If a ``block.db`` or a ``block.wal`` is needed (they are optional for
bluestore) they can be specified with ``--block.db`` and ``--block.wal``
accordingly. These can be a physical device, a partition or
a logical volume.
If a ``block.db`` device or a ``block.wal`` device is needed, it can be
specified with ``--block.db`` or ``--block.wal``. These can be physical
devices, partitions, or logical volumes. ``block.db`` and ``block.wal`` are
optional for bluestore.
For both ``block.db`` and ``block.wal`` partitions aren't made logical volumes
because they can be used as-is.
For both ``block.db`` and ``block.wal``, partitions can be used as-is, and
therefore are not made into logical volumes.
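For example, a sketch that places the DB on a separate device (the device path
here is only a placeholder):

.. prompt:: bash #

   ceph-volume lvm prepare --bluestore --data vg/lv --block.db /dev/nvme0n1p1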
While creating the OSD directory, the process will use a ``tmpfs`` mount to
place all the files needed for the OSD. These files are initially created by
``ceph-osd --mkfs`` and are fully ephemeral.
While creating the OSD directory, the process uses a ``tmpfs`` mount to hold
the files needed for the OSD. These files are created by ``ceph-osd --mkfs``
and are ephemeral.
A symlink is always created for the ``block`` device, and optionally for
``block.db`` and ``block.wal``. For a cluster with a default name, and an OSD
id of 0, the directory could look like::
A symlink is created for the ``block`` device, and is optional for ``block.db``
and ``block.wal``. For a cluster with a default name and an OSD ID of 0, the
directory looks like this::
# ls -l /var/lib/ceph/osd/ceph-0
lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block -> /dev/ceph-be2b6fbd-bcf2-4c51-b35d-a35a162a02f0/osd-block-25cf0a05-2bc6-44ef-9137-79d65bd7ad62
@ -85,11 +90,11 @@ id of 0, the directory could look like::
-rw-------. 1 ceph ceph 10 Oct 20 13:05 type
-rw-------. 1 ceph ceph 2 Oct 20 13:05 whoami
In the above case, a device was used for ``block`` so ``ceph-volume`` create
a volume group and a logical volume using the following convention:
In the above case, a device was used for ``block``, so ``ceph-volume`` created
a volume group and a logical volume using the following conventions:
* volume group name: ``ceph-{cluster fsid}`` or if the vg exists already
``ceph-{random uuid}``
* volume group name: ``ceph-{cluster fsid}`` (or if the volume group already
exists: ``ceph-{random uuid}``)
* logical volume name: ``osd-block-{osd_fsid}``
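To verify the result, the volume group, the logical volume, and the LVM tags
that were applied can be inspected (a minimal sketch; output omitted):

.. prompt:: bash #

   ceph-volume lvm list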
@ -98,78 +103,100 @@ a volume group and a logical volume using the following convention:
``filestore``
-------------
This is the OSD backend that allows preparation of logical volumes for
a :term:`filestore` objectstore OSD.
``Filestore<filestore>`` is the OSD backend that prepares logical volumes for a
:term:`filestore`-backed object-store OSD.
It can use a logical volume for the OSD data and a physical device, a partition
or logical volume for the journal. A physical device will have a logical volume
created on it. A volume group will either be created or reused it its name begins
with ``ceph``. No special preparation is needed for these volumes other than
following the minimum size requirements for data and journal.
The CLI call looks like this of a basic standalone filestore OSD::
``Filestore<filestore>`` uses a logical volume to store OSD data and it uses
physical devices, partitions, or logical volumes to store the journal. If a
physical device is used to create a filestore backend, a logical volume will be
created on that physical device. If the provided volume group's name begins
with `ceph`, it will be created if it does not yet exist and it will be
clobbered and reused if it already exists. No special preparation is needed for
these volumes, but be sure to meet the minimum size requirements for OSD data and
for the journal.
ceph-volume lvm prepare --filestore --data <data block device>
Use the following command to create a basic filestore OSD:
To deploy file store with an external journal::
.. prompt:: bash #
ceph-volume lvm prepare --filestore --data <data block device> --journal <journal block device>
ceph-volume lvm prepare --filestore --data <data block device>
For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required::
Use this command to deploy filestore with an external journal:
ceph-volume lvm prepare --filestore --dmcrypt --data <data block device> --journal <journal block device>
.. prompt:: bash #
Both the journal and data block device can take three forms:
ceph-volume lvm prepare --filestore --data <data block device> --journal <journal block device>
Use this command to enable :ref:`encryption <ceph-volume-lvm-encryption>`, and note that the ``--dmcrypt`` flag is required:
.. prompt:: bash #
ceph-volume lvm prepare --filestore --dmcrypt --data <data block device> --journal <journal block device>
The data block device and the journal can each take one of three forms:
* a physical block device
* a partition on a physical block device
* a logical volume
When using logical volumes the value *must* be of the format
``volume_group/logical_volume``. Since logical volume names
are not enforced for uniqueness, this prevents accidentally
choosing the wrong volume.
If you use a logical volume to deploy filestore, the value that you pass in the
command *must* be of the format ``volume_group/logical_volume_name``. Since logical
volume names are not enforced for uniqueness, using this format is an important
safeguard against accidentally choosing the wrong volume (and clobbering its data).
When using a partition, it *must* contain a ``PARTUUID``, that can be
discovered by ``blkid``. THis ensure it can later be identified correctly
regardless of the device name (or path).
If you use a partition to deploy filestore, the partition *must* contain a
``PARTUUID`` that can be discovered by ``blkid``. This ensures that the
partition can be identified correctly regardless of the device's name (or path).
For example: passing a logical volume for data and a partition ``/dev/sdc1`` for
the journal::
For example, to use a logical volume for OSD data and a partition
(``/dev/sdc1``) for the journal, run a command of this form:
ceph-volume lvm prepare --filestore --data volume_group/lv_name --journal /dev/sdc1
.. prompt:: bash #
Passing a bare device for data and a logical volume ias the journal::
ceph-volume lvm prepare --filestore --data volume_group/logical_volume_name --journal /dev/sdc1
ceph-volume lvm prepare --filestore --data /dev/sdc --journal volume_group/journal_lv
Or, to use a bare device for data and a logical volume for the journal:
A generated uuid is used to ask the cluster for a new OSD. These two pieces are
crucial for identifying an OSD and will later be used throughout the
:ref:`ceph-volume-lvm-activate` process.
.. prompt:: bash #
ceph-volume lvm prepare --filestore --data /dev/sdc --journal volume_group/journal_lv
A generated UUID is used when asking the cluster for a new OSD. These two
pieces of information (the OSD ID and the OSD UUID) are necessary for
identifying a given OSD and will later be used throughout the
:ref:`activation<ceph-volume-lvm-activate>` process.
The OSD data directory is created using the following convention::
/var/lib/ceph/osd/<cluster name>-<osd id>
At this point the data volume is mounted at this location, and the journal
volume is linked::
To link the journal volume to the mounted data volume, use this command:
ln -s /path/to/journal /var/lib/ceph/osd/<cluster_name>-<osd-id>/journal
.. prompt:: bash #
The monmap is fetched using the bootstrap key from the OSD::
ln -s /path/to/journal /var/lib/ceph/osd/<cluster_name>-<osd-id>/journal
/usr/bin/ceph --cluster ceph --name client.bootstrap-osd
--keyring /var/lib/ceph/bootstrap-osd/ceph.keyring
mon getmap -o /var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap
To fetch the monmap by using the bootstrap key from the OSD, use this command:
``ceph-osd`` will be called to populate the OSD directory, that is already
mounted, re-using all the pieces of information from the initial steps::
.. prompt:: bash #
/usr/bin/ceph --cluster ceph --name client.bootstrap-osd --keyring
/var/lib/ceph/bootstrap-osd/ceph.keyring mon getmap -o
/var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap
To populate the OSD directory (which has already been mounted), use this ``ceph-osd`` command:
.. prompt:: bash #
ceph-osd --cluster ceph --mkfs --mkkey -i <osd id> \ --monmap
/var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap --osd-data \
/var/lib/ceph/osd/<cluster name>-<osd id> --osd-journal
/var/lib/ceph/osd/<cluster name>-<osd id>/journal \ --osd-uuid <osd uuid>
--keyring /var/lib/ceph/osd/<cluster name>-<osd id>/keyring \ --setuser ceph
--setgroup ceph
All of the information from the previous steps is used in the above command.
ceph-osd --cluster ceph --mkfs --mkkey -i <osd id> \
--monmap /var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap --osd-data \
/var/lib/ceph/osd/<cluster name>-<osd id> --osd-journal /var/lib/ceph/osd/<cluster name>-<osd id>/journal \
--osd-uuid <osd uuid> --keyring /var/lib/ceph/osd/<cluster name>-<osd id>/keyring \
--setuser ceph --setgroup ceph
.. _ceph-volume-lvm-partitions:

View File

@ -113,15 +113,15 @@ Adoption process
ssh-copy-id -f -i ~/ceph.pub root@<host>
.. note::
It is also possible to import an existing ssh key. See
:ref:`ssh errors <cephadm-ssh-errors>` in the troubleshooting
It is also possible to import an existing SSH key. See
:ref:`SSH errors <cephadm-ssh-errors>` in the troubleshooting
document for instructions that describe how to import existing
ssh keys.
SSH keys.
.. note::
It is also possible to have cephadm use a non-root user to ssh
It is also possible to have cephadm use a non-root user to SSH
into cluster hosts. This user needs to have passwordless sudo access.
Use ``ceph cephadm set-user <user>`` and copy the ssh key to that user.
Use ``ceph cephadm set-user <user>`` and copy the SSH key to that user.
See :ref:`cephadm-ssh-user`
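For example (the user name ``cephadm-user`` is only illustrative), the key can
be copied to that user and cephadm pointed at it:

.. prompt:: bash #

   ssh-copy-id -f -i ~/ceph.pub cephadm-user@<host>
   ceph cephadm set-user cephadm-user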
#. Tell cephadm which hosts to manage:

View File

@ -8,11 +8,10 @@ Compatibility and Stability
Compatibility with Podman Versions
----------------------------------
Podman and Ceph have different end-of-life strategies that
might make it challenging to find compatible Podman and Ceph
versions
Podman and Ceph have different end-of-life strategies. This means that care
must be taken in finding a version of Podman that is compatible with Ceph.
Those versions are expected to work:
These versions are expected to work:
+-----------+---------------------------------------+
@ -28,7 +27,13 @@ Those versions are expected to work:
+-----------+-------+-------+-------+-------+-------+
.. warning::
Only podman versions that are 2.0.0 and higher work with Ceph Pacific, with the exception of podman version 2.2.1, which does not work with Ceph Pacific. kubic stable is known to work with Ceph Pacific, but it must be run with a newer kernel.
To use Podman with Ceph Pacific, you must use **a version of Podman that
is 2.0.0 or higher**. However, **Podman version 2.2.1 does not work with
Ceph Pacific**.
"Kubic stable" is known to work with Ceph Pacific, but it must be run
with a newer kernel.
.. _cephadm-stability:
@ -36,19 +41,18 @@ Those versions are expected to work:
Stability
---------
Cephadm is actively in development. Please be aware that some
functionality is still rough around the edges. Especially the
following components are working with cephadm, but the
documentation is not as complete as we would like, and there may be some
changes in the near future:
Cephadm is under development. Some functionality is incomplete. Be aware
that some of the components of Ceph may not work perfectly with cephadm.
These include:
- RGW
Cephadm support for the following features is still under development and may see breaking
changes in future releases:
Cephadm support remains under development for the following features:
- Ingress
- Cephadm exporter daemon
- cephfs-mirror
In case you encounter issues, see also :ref:`cephadm-pause`.
If a cephadm command fails or a service stops running properly, see
:ref:`cephadm-pause` for instructions on how to pause the Ceph cluster's
background activity and how to disable cephadm.

View File

@ -4,17 +4,26 @@
Host Management
===============
To list hosts associated with the cluster:
Listing Hosts
=============
Run a command of this form to list hosts associated with the cluster:
.. prompt:: bash #
ceph orch host ls [--format yaml] [--host-pattern <name>] [--label <label>] [--host-status <status>]
ceph orch host ls [--format yaml] [--host-pattern <name>] [--label <label>] [--host-status <status>]
where the optional arguments "host-pattern", "label" and "host-status" are used for filtering.
"host-pattern" is a regex that will match against hostnames and will only return matching hosts
"label" will only return hosts with the given label
"host-status" will only return hosts with the given status (currently "offline" or "maintenance")
Any combination of these filtering flags is valid. You may filter against name, label and/or status simultaneously
In commands of this form, the arguments "host-pattern", "label" and
"host-status" are optional and are used for filtering.
- "host-pattern" is a regex that matches against hostnames and returns only
matching hosts.
- "label" returns only hosts with the specified label.
- "host-status" returns only hosts with the specified status (currently
"offline" or "maintenance").
- Any combination of these filtering flags is valid. It is possible to filter
against name, label and status simultaneously, or to filter against any
proper subset of name, label and status.
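For example (the label ``mon`` is only illustrative), two filters can be
combined in a single invocation:

.. prompt:: bash #

   ceph orch host ls --label mon --host-status offline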
.. _cephadm-adding-hosts:
@ -30,7 +39,7 @@ To add each new host to the cluster, perform two steps:
.. prompt:: bash #
ssh-copy-id -f -i /etc/ceph/ceph.pub root@*<new-host>*
ssh-copy-id -f -i /etc/ceph/ceph.pub root@*<new-host>*
For example:
@ -43,7 +52,7 @@ To add each new host to the cluster, perform two steps:
.. prompt:: bash #
ceph orch host add *<newhost>* [*<ip>*] [*<label1> ...*]
ceph orch host add *<newhost>* [*<ip>*] [*<label1> ...*]
For example:
@ -63,54 +72,60 @@ To add each new host to the cluster, perform two steps:
.. prompt:: bash #
ceph orch host add host4 10.10.0.104 --labels _admin
ceph orch host add host4 10.10.0.104 --labels _admin
.. _cephadm-removing-hosts:
Removing Hosts
==============
A host can safely be removed from a the cluster once all daemons are removed from it.
A host can safely be removed from the cluster after all daemons are removed
from it.
To drain all daemons from a host do the following:
To drain all daemons from a host, run a command of the following form:
.. prompt:: bash #
ceph orch host drain *<host>*
ceph orch host drain *<host>*
The '_no_schedule' label will be applied to the host. See :ref:`cephadm-special-host-labels`
The ``_no_schedule`` label will be applied to the host. See
:ref:`cephadm-special-host-labels`.
All osds on the host will be scheduled to be removed. You can check osd removal progress with the following:
All OSDs on the host will be scheduled to be removed. You can check the progress of the OSD removal operation with the following command:
.. prompt:: bash #
ceph orch osd rm status
ceph orch osd rm status
see :ref:`cephadm-osd-removal` for more details about osd removal
See :ref:`cephadm-osd-removal` for more details about OSD removal.
You can check if there are no deamons left on the host with the following:
Use the following command to determine whether any daemons are still on the
host:
.. prompt:: bash #
ceph orch ps <host>
ceph orch ps <host>
Once all daemons are removed you can remove the host with the following:
After all daemons have been removed from the host, remove the host from the
cluster by running the following command:
.. prompt:: bash #
ceph orch host rm <host>
ceph orch host rm <host>
Offline host removal
--------------------
If a host is offline and can not be recovered it can still be removed from the cluster with the following:
Even if a host is offline and can not be recovered, it can be removed from the
cluster by running a command of the following form:
.. prompt:: bash #
ceph orch host rm <host> --offline --force
ceph orch host rm <host> --offline --force
This can potentially cause data loss as osds will be forcefully purged from the cluster by calling ``osd purge-actual`` for each osd.
Service specs that still contain this host should be manually updated.
.. warning:: This can potentially cause data loss. This command forcefully
purges OSDs from the cluster by calling ``osd purge-actual`` for each OSD.
Any service specs that still contain this host should be manually updated.
.. _orchestrator-host-labels:
@ -122,18 +137,24 @@ are free form and have no particular meaning by itself and each host
can have multiple labels. They can be used to specify placement
of daemons. See :ref:`orch-placement-by-labels`
Labels can be added when adding a host with the ``--labels`` flag::
Labels can be added when adding a host with the ``--labels`` flag:
ceph orch host add my_hostname --labels=my_label1
ceph orch host add my_hostname --labels=my_label1,my_label2
.. prompt:: bash #
To add a label a existing host, run::
ceph orch host add my_hostname --labels=my_label1
ceph orch host add my_hostname --labels=my_label1,my_label2
ceph orch host label add my_hostname my_label
To add a label to an existing host, run:
To remove a label, run::
.. prompt:: bash #
ceph orch host label rm my_hostname my_label
ceph orch host label add my_hostname my_label
To remove a label, run:
.. prompt:: bash #
ceph orch host label rm my_hostname my_label
.. _cephadm-special-host-labels:
@ -166,15 +187,39 @@ The following host labels have a special meaning to cephadm. All start with ``_
Maintenance Mode
================
Place a host in and out of maintenance mode (stops all Ceph daemons on host)::
Place a host in and out of maintenance mode (stops all Ceph daemons on host):
ceph orch host maintenance enter <hostname> [--force]
ceph orch host maintenance exit <hostname>
.. prompt:: bash #
ceph orch host maintenance enter <hostname> [--force]
ceph orch host maintenance exit <hostname>
The force flag, when entering maintenance, allows the user to bypass warnings (but not alerts)
See also :ref:`cephadm-fqdn`
Rescanning Host Devices
=======================
Some servers and external enclosures may not register device removal or insertion with the
kernel. In these scenarios, you'll need to perform a host rescan. A rescan is typically
non-disruptive, and can be performed with the following CLI command:
.. prompt:: bash #
ceph orch host rescan <hostname> [--with-summary]
The ``with-summary`` flag provides a breakdown of the number of HBAs found and scanned, together
with any that failed:
.. prompt:: bash [ceph:root@rh9-ceph1/]#
ceph orch host rescan rh9-ceph1 --with-summary
::
Ok. 2 adapters detected: 2 rescanned, 0 skipped, 0 failed (0.32s)
Creating many hosts at once
===========================
@ -241,26 +286,36 @@ connect to remote hosts. When the cluster is bootstrapped, this SSH
key is generated automatically and no additional configuration
is necessary.
A *new* SSH key can be generated with::
A *new* SSH key can be generated with:
ceph cephadm generate-key
.. prompt:: bash #
The public portion of the SSH key can be retrieved with::
ceph cephadm generate-key
ceph cephadm get-pub-key
The public portion of the SSH key can be retrieved with:
The currently stored SSH key can be deleted with::
.. prompt:: bash #
ceph cephadm clear-key
ceph cephadm get-pub-key
You can make use of an existing key by directly importing it with::
The currently stored SSH key can be deleted with:
ceph config-key set mgr/cephadm/ssh_identity_key -i <key>
ceph config-key set mgr/cephadm/ssh_identity_pub -i <pub>
.. prompt:: bash #
You will then need to restart the mgr daemon to reload the configuration with::
ceph cephadm clear-key
ceph mgr fail
You can make use of an existing key by directly importing it with:
.. prompt:: bash #
ceph config-key set mgr/cephadm/ssh_identity_key -i <key>
ceph config-key set mgr/cephadm/ssh_identity_pub -i <pub>
You will then need to restart the mgr daemon to reload the configuration with:
.. prompt:: bash #
ceph mgr fail
.. _cephadm-ssh-user:
@ -272,11 +327,13 @@ that has enough privileges to download container images, start containers
and execute commands without prompting for a password. If you do not want
to use the "root" user (default option in cephadm), you must provide
cephadm the name of the user that is going to be used to perform all the
cephadm operations. Use the command::
cephadm operations. Use the command:
ceph cephadm set-user <user>
.. prompt:: bash #
Prior to running this the cluster ssh key needs to be added to this users
ceph cephadm set-user <user>
Prior to running this, the cluster SSH key needs to be added to this user's
authorized_keys file, and non-root users must have passwordless sudo access.
@ -295,17 +352,23 @@ something like this::
There are two ways to customize this configuration for your environment:
#. Import a customized configuration file that will be stored
by the monitor with::
by the monitor with:
ceph cephadm set-ssh-config -i <ssh_config_file>
.. prompt:: bash #
To remove a customized SSH config and revert back to the default behavior::
ceph cephadm set-ssh-config -i <ssh_config_file>
ceph cephadm clear-ssh-config
To remove a customized SSH config and revert back to the default behavior:
#. You can configure a file location for the SSH configuration file with::
.. prompt:: bash #
ceph config set mgr mgr/cephadm/ssh_config_file <path>
ceph cephadm clear-ssh-config
#. You can configure a file location for the SSH configuration file with:
.. prompt:: bash #
ceph config set mgr mgr/cephadm/ssh_config_file <path>
We do *not recommend* this approach. The path name must be
visible to *any* mgr daemon, and cephadm runs all daemons as
@ -370,4 +433,4 @@ requires the bare host name when adding a host to the cluster:
..
TODO: This chapter needs to provide way for users to configure
Grafana in the dashboard, as this is right no very hard to do.
Grafana in the dashboard, as this is right now very hard to do.

View File

@ -4,21 +4,36 @@
Cephadm
=======
``cephadm`` deploys and manages a Ceph cluster. It does this by connecting the
manager daemon to hosts via SSH. The manager daemon is able to add, remove, and
update Ceph containers. ``cephadm`` does not rely on external configuration
tools such as Ansible, Rook, and Salt.
``cephadm`` is a utility that is used to manage a Ceph cluster.
``cephadm`` manages the full lifecycle of a Ceph cluster. This lifecycle
starts with the bootstrapping process, when ``cephadm`` creates a tiny
Ceph cluster on a single node. This cluster consists of one monitor and
one manager. ``cephadm`` then uses the orchestration interface ("day 2"
commands) to expand the cluster, adding all hosts and provisioning all
Ceph daemons and services. Management of this lifecycle can be performed
either via the Ceph command-line interface (CLI) or via the dashboard (GUI).
Here is a list of some of the things that ``cephadm`` can do:
``cephadm`` is new in Ceph release v15.2.0 (Octopus) and does not support older
versions of Ceph.
- ``cephadm`` can add a Ceph container to the cluster.
- ``cephadm`` can remove a Ceph container from the cluster.
- ``cephadm`` can update Ceph containers.
``cephadm`` does not rely on external configuration tools like Ansible, Rook,
or Salt. However, those external configuration tools can be used to automate
operations not performed by cephadm itself. To learn more about these external
configuration tools, visit their pages:
* https://github.com/ceph/cephadm-ansible
* https://rook.io/docs/rook/v1.10/Getting-Started/intro/
* https://github.com/ceph/ceph-salt
``cephadm`` manages the full lifecycle of a Ceph cluster. This lifecycle starts
with the bootstrapping process, when ``cephadm`` creates a tiny Ceph cluster on
a single node. This cluster consists of one monitor and one manager.
``cephadm`` then uses the orchestration interface to expand the cluster, adding
hosts and provisioning Ceph daemons and services. Management of this lifecycle
can be performed either via the Ceph command-line interface (CLI) or via the
dashboard (GUI).
To use ``cephadm`` to get started with Ceph, follow the instructions in
:ref:`cephadm_deploying_new_cluster`.
``cephadm`` was introduced in Ceph release v15.2.0 (Octopus) and does not
support older versions of Ceph.
.. toctree::
:maxdepth: 2

View File

@ -1,3 +1,5 @@
.. _cephadm_deploying_new_cluster:
============================
Deploying a new Ceph cluster
============================
@ -8,7 +10,6 @@ then deploying the needed services.
.. highlight:: console
.. _cephadm-host-requirements:
Requirements
@ -35,17 +36,12 @@ Ceph.
Install cephadm
===============
The ``cephadm`` command can
#. bootstrap a new cluster
#. launch a containerized shell with a working Ceph CLI
#. aid in debugging containerized Ceph daemons
There are two ways to install ``cephadm``:
#. a :ref:`curl-based installation<cephadm_install_curl>` method
#. :ref:`distribution-specific installation methods<cephadm_install_distros>`
.. _cephadm_install_curl:
curl-based installation
@ -214,8 +210,8 @@ available options.
EOF
$ ./cephadm bootstrap --config initial-ceph.conf ...
* The ``--ssh-user *<user>*`` option makes it possible to choose which ssh
user cephadm will use to connect to hosts. The associated ssh key will be
* The ``--ssh-user *<user>*`` option makes it possible to choose which SSH
user cephadm will use to connect to hosts. The associated SSH key will be
added to ``/home/*<user>*/.ssh/authorized_keys``. The user that you
designate with this option must have passwordless sudo access.
@ -366,38 +362,78 @@ Different deployment scenarios
Single host
-----------
To configure a Ceph cluster to run on a single host, use the ``--single-host-defaults`` flag when bootstrapping. For use cases of this, see :ref:`one-node-cluster`.
To configure a Ceph cluster to run on a single host, use the
``--single-host-defaults`` flag when bootstrapping. For use cases of this, see
:ref:`one-node-cluster`.
The ``--single-host-defaults`` flag sets the following configuration options::
global/osd_crush_choose_leaf_type = 0
global/osd_crush_chooseleaf_type = 0
global/osd_pool_default_size = 2
mgr/mgr_standby_modules = False
For more information on these options, see :ref:`one-node-cluster` and ``mgr_standby_modules`` in :ref:`mgr-administrator-guide`.
For more information on these options, see :ref:`one-node-cluster` and
``mgr_standby_modules`` in :ref:`mgr-administrator-guide`.
.. _cephadm-airgap:
Deployment in an isolated environment
-------------------------------------
You can install Cephadm in an isolated environment by using a custom container registry. You can either configure Podman or Docker to use an insecure registry, or make the registry secure. Ensure your container image is inside the registry and that you have access to all hosts you wish to add to the cluster.
You might need to install cephadm in an environment that is not connected
directly to the internet (such an environment is also called an "isolated
environment"). This can be done if a custom container registry is used. Either
of two kinds of custom container registry can be used in this scenario: (1) a
Podman-based or Docker-based insecure registry, or (2) a secure registry.
Run a local container registry:
The practice of installing software on systems that are not connected directly
to the internet is called "airgapping" and registries that are not connected
directly to the internet are referred to as "airgapped".
.. prompt:: bash #
Make sure that your container image is inside the registry. Make sure that you
have access to all hosts that you plan to add to the cluster.
podman run --privileged -d --name registry -p 5000:5000 -v /var/lib/registry:/var/lib/registry --restart=always registry:2
#. Run a local container registry:
If you are using an insecure registry, configure Podman or Docker with the hostname and port where the registry is running.
.. prompt:: bash #
.. note:: For every host which accesses the local insecure registry, you will need to repeat this step on the host.
podman run --privileged -d --name registry -p 5000:5000 -v /var/lib/registry:/var/lib/registry --restart=always registry:2
Next, push your container image to your local registry.
#. If you are using an insecure registry, configure Podman or Docker with the
hostname and port where the registry is running.
Then run bootstrap using the ``--image`` flag with your container image. For example:
.. note:: You must repeat this step for every host that accesses the local
insecure registry.
.. prompt:: bash #
#. Push your container image to your local registry (a concrete example is
   shown after this list). Here are some acceptable kinds of container images:
cephadm --image *<hostname>*:5000/ceph/ceph bootstrap --mon-ip *<mon-ip>*
* Ceph container image. See :ref:`containers`.
* Prometheus container image
* Node exporter container image
* Grafana container image
* Alertmanager container image
#. Create a temporary configuration file to store the names of the monitoring
images. (See :ref:`cephadm_monitoring-images`):
.. prompt:: bash $

   cat <<EOF > initial-ceph.conf
   [mgr]
   mgr/cephadm/container_image_prometheus = *<hostname>*:5000/prometheus
   mgr/cephadm/container_image_node_exporter = *<hostname>*:5000/node_exporter
   mgr/cephadm/container_image_grafana = *<hostname>*:5000/grafana
   mgr/cephadm/container_image_alertmanager = *<hostname>*:5000/alertmanager
   EOF
#. Run bootstrap using the ``--image`` flag and pass the name of your
container image as the argument of the image flag. For example:
.. prompt:: bash #
cephadm --image *<hostname>*:5000/ceph/ceph bootstrap --mon-ip *<mon-ip>*
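To illustrate step 3 above (assuming the registry from step 1 is reachable at
``<hostname>:5000`` and using the v16.2.11 Ceph image purely as an example), the
image could be mirrored into the local registry like this:

.. prompt:: bash #

   podman pull quay.io/ceph/ceph:v16.2.11
   podman tag quay.io/ceph/ceph:v16.2.11 <hostname>:5000/ceph/ceph:v16.2.11
   # --tls-verify=false is only needed for an insecure registry
   podman push --tls-verify=false <hostname>:5000/ceph/ceph:v16.2.11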
.. _cluster network: ../rados/configuration/network-config-ref#cluster-network

View File

@ -86,7 +86,20 @@ Service Specification
=====================
A *Service Specification* is a data structure that is used to specify the
deployment of services. Here is an example of a service specification in YAML:
deployment of services. In addition to parameters such as `placement` or
`networks`, the user can set initial values of service configuration parameters
by means of the `config` section. For each param/value configuration pair,
cephadm calls the following command to set its value:
.. prompt:: bash #
ceph config set <service-name> <param> <value>
cephadm raises health warnings if invalid configuration parameters are found
in the spec (`CEPHADM_INVALID_CONFIG_OPTION`) or if any error occurs while
trying to apply the new configuration option(s) (`CEPHADM_FAILED_SET_OPTION`).
Here is an example of a service specification in YAML:
.. code-block:: yaml
@ -97,6 +110,10 @@ deployment of services. Here is an example of a service specification in YAML:
- host1
- host2
- host3
config:
param_1: val_1
...
param_N: val_N
unmanaged: false
networks:
- 192.169.142.0/24
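As a hedged illustration of the ``config`` section (the option below is just one
example of an ordinary ``ceph config`` setting), a mon specification could carry
an initial option like this:

.. code-block:: yaml

   service_type: mon
   placement:
     count: 3
   config:
     mon_compact_on_start: true   # cephadm will run "ceph config set mon mon_compact_on_start true"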
@ -414,7 +431,7 @@ Cephadm supports the deployment of multiple daemons on the same host:
service_type: rgw
placement:
label: rgw
count-per-host: 2
count_per_host: 2
The main reason for deploying multiple daemons per host is an additional
performance benefit for running multiple RGW and MDS daemons on the same host.
@ -501,10 +518,32 @@ a spec like
- host2
- host3
extra_container_args:
- "--cpus=2"
- "--cpus=2"
which would cause each mon daemon to be deployed with `--cpus=2`.
Mounting Files with Extra Container Arguments
---------------------------------------------
A common use case for extra container arguments is to mount additional
files within the container. However, some intuitive formats for doing
so can cause deployment to fail (see https://tracker.ceph.com/issues/57338).
The recommended syntax for mounting a file with extra container arguments is:
.. code-block:: yaml
extra_container_args:
- "-v"
- "/absolute/file/path/on/host:/absolute/file/path/in/container"
For example:
.. code-block:: yaml
extra_container_args:
- "-v"
- "/opt/ceph_cert/host.cert:/etc/grafana/certs/cert_file:ro"
.. _orch-rm:
Removing a Service

View File

@ -103,6 +103,8 @@ example spec file:
spec:
port: 4200
.. _cephadm_monitoring-images:
Using custom images
~~~~~~~~~~~~~~~~~~~
@ -161,6 +163,8 @@ For example, if you had changed the prometheus image
ceph config rm mgr mgr/cephadm/container_image_prometheus
See also :ref:`cephadm-airgap`.
.. _cephadm-overwrite-jinja2-templates:
Using custom configuration files
@ -195,6 +199,7 @@ set``:
- ``services/grafana/ceph-dashboard.yml``
- ``services/grafana/grafana.ini``
- ``services/prometheus/prometheus.yml``
- ``services/prometheus/alerting/custom_alerts.yml``
You can look up the file templates that are currently used by cephadm in
``src/pybind/mgr/cephadm/templates``:
@ -240,6 +245,15 @@ Example
# reconfig the prometheus service
ceph orch reconfig prometheus
.. code-block:: bash
# set additional custom alerting rules for Prometheus
ceph config-key set mgr/cephadm/services/prometheus/alerting/custom_alerts.yml \
-i $PWD/custom_alerts.yml
# Note that custom alerting rules are not parsed by Jinja and hence escaping
# will not be an issue.
Deploying monitoring without cephadm
------------------------------------
@ -282,6 +296,32 @@ Due to performance reasons, monitoring of RBD images is disabled by default. For
:ref:`prometheus-rbd-io-statistics`. If disabled, the overview and details dashboards will stay empty in Grafana
and the metrics will not be visible in Prometheus.
Setting up Prometheus
-----------------------
Setting Prometheus Retention Time
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Cephadm provides the option to set the Prometheus TSDB retention time using
a ``retention_time`` field in the Prometheus service spec. The value defaults
to 15 days (15d). If you would like a different value, such as 1 year (1y), you
can apply a service spec similar to:
.. code-block:: yaml
service_type: prometheus
placement:
count: 1
spec:
retention_time: "1y"
.. note::
If you already had Prometheus daemon(s) deployed and are updating an
existing spec, as opposed to doing a fresh Prometheus deployment, you must also
tell cephadm to redeploy the Prometheus daemon(s) to put this change into effect.
This can be done with a ``ceph orch redeploy prometheus`` command.
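Assuming the spec above was saved to a file such as ``prometheus.yaml`` (the file
name is arbitrary), it could be applied and the daemons redeployed as follows:

.. prompt:: bash #

   ceph orch apply -i prometheus.yaml
   ceph orch redeploy prometheus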
Setting up Grafana
------------------

View File

@ -138,6 +138,12 @@ There are a few ways to create new OSDs:
ceph orch daemon add osd host1:/dev/sdb
Advanced OSD creation from specific devices on a specific host:
.. prompt:: bash #
ceph orch daemon add osd host1:data_devices=/dev/sda,/dev/sdb,db_devices=/dev/sdc,osds_per_device=2
* You can use :ref:`drivegroups` to categorize device(s) based on their
properties. This might be useful in forming a clearer picture of which
devices are available to consume. Properties include device type (SSD or
@ -239,6 +245,18 @@ Expected output::
OSDs that are not safe to destroy will be rejected.
.. note::
After removing OSDs, if the drives the OSDs were deployed on once again
become available, cephadm may automatically try to deploy more OSDs
on these drives if they match an existing drivegroup spec. If you deployed
the OSDs you are removing with a spec and don't want any new OSDs deployed on
the drives after removal, it's best to modify the drivegroup spec before removal.
Either set ``unmanaged: true`` to stop it from picking up new drives at all,
or modify it in some way that it no longer matches the drives used for the
OSDs you wish to remove. Then re-apply the spec. For more info on drivegroup
specs see :ref:`drivegroups`. For more info on the declarative nature of
cephadm in reference to deploying OSDs, see :ref:`cephadm-osd-declarative`.
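For example, a sketch of such a modified drivegroup spec (the service id
``my_osds`` and the placement are hypothetical) that simply stops cephadm from
consuming new drives would be:

.. code-block:: yaml

   service_type: osd
   service_id: my_osds
   unmanaged: true          # keep managing existing OSDs, but deploy no new ones
   placement:
     host_pattern: '*'
   spec:
     data_devices:
       all: true

Re-apply the modified spec with ``ceph orch apply -i <spec-file>`` before removing
the OSDs.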
Monitoring OSD State
--------------------
@ -283,6 +301,7 @@ Expected output::
This resets the initial state of the OSD and takes it off the removal queue.
.. _cephadm-replacing-an-osd:
Replacing an OSD
----------------

View File

@ -65,14 +65,14 @@ example spec file:
.. code-block:: yaml
service_type: rgw
service_name: foo
service_id: foo
placement:
label: rgw
count-per-host: 2
count_per_host: 2
networks:
- 192.169.142.0/24
spec:
port: 8000
rgw_frontend_port: 8080
Multisite zones
@ -224,6 +224,33 @@ It is a yaml format file with the following properties:
...
-----END PRIVATE KEY-----
.. code-block:: yaml
service_type: ingress
service_id: rgw.something # adjust to match your existing RGW service
placement:
hosts:
- host1
- host2
- host3
spec:
backend_service: rgw.something # adjust to match your existing RGW service
virtual_ips_list:
- <string>/<string> # ex: 192.168.20.1/24
- <string>/<string> # ex: 192.168.20.2/24
- <string>/<string> # ex: 192.168.20.3/24
frontend_port: <integer> # ex: 8080
monitor_port: <integer> # ex: 1967, used by haproxy for load balancer status
virtual_interface_networks: [ ... ] # optional: list of CIDR networks
ssl_cert: | # optional: SSL certificate and key
-----BEGIN CERTIFICATE-----
...
-----END CERTIFICATE-----
-----BEGIN PRIVATE KEY-----
...
-----END PRIVATE KEY-----
where the properties of this service specification are:
* ``service_type``
@ -237,6 +264,10 @@ where the properties of this service specification are:
to match the nodes where RGW is deployed.
* ``virtual_ip``
The virtual IP (and network) in CIDR format where the ingress service will be available.
* ``virtual_ips_list``
The virtual IP address in CIDR format where the ingress service will be available.
Each virtual IP address will be primary on one node running the ingress service. The number
of virtual IP addresses must be less than or equal to the number of ingress nodes.
* ``virtual_interface_networks``
A list of networks to identify which ethernet interface to use for the virtual IP.
* ``frontend_port``

View File

@ -179,7 +179,7 @@ container execution command.
.. _cephadm-ssh-errors:
ssh errors
SSH errors
----------
Error message::
@ -208,7 +208,7 @@ Things users can do:
[root@mon1 ~]# cat ~/cephadm_private_key | cephadm shell -- ceph cephadm set-ssk-key -i -
2. Ensure that the ssh config is correct::
2. Ensure that the SSH config is correct::
[root@mon1 ~]# cephadm shell -- ceph cephadm get-ssh-config > config

View File

@ -96,6 +96,12 @@ You can stop the upgrade process at any time by running the following command:
ceph orch upgrade stop
Post upgrade actions
====================
If the new version is based on ``cephadm``, then once the upgrade is complete the user
must update the ``cephadm`` package (or the ``ceph-common`` package, if the user
doesn't use ``cephadm shell``) to a version compatible with the new version.
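For example (assuming a package-based installation; adapt the command to your
distribution's package manager):

.. prompt:: bash #

   dnf update cephadm                      # RPM-based distributions
   apt install --only-upgrade cephadm      # Debian/Ubuntu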
Potential problems
==================
@ -165,3 +171,100 @@ you need. For example, the following command upgrades to a development build:
ceph orch upgrade start --image quay.io/ceph-ci/ceph:recent-git-branch-name
For more information about available container images, see :ref:`containers`.
Staggered Upgrade
=================
Some users may prefer to upgrade components in phases rather than all at once.
The upgrade command, starting in 16.2.11 and 17.2.1, accepts parameters
to limit which daemons are upgraded by a single upgrade command. The options
include ``daemon_types``, ``services``, ``hosts`` and ``limit``. ``daemon_types``
takes a comma-separated list of daemon types and will only upgrade daemons of those
types. ``services`` is mutually exclusive with ``daemon_types``, only takes services
of one type at a time (e.g. can't provide an OSD and RGW service at the same time), and
will only upgrade daemons belonging to those services. ``hosts`` can be combined
with ``daemon_types`` or ``services`` or provided on its own. The ``hosts`` parameter
follows the same format as the command line options for :ref:`orchestrator-cli-placement-spec`.
``limit`` takes an integer > 0 and provides a numerical limit on the number of
daemons cephadm will upgrade. ``limit`` can be combined with any of the other
parameters. For example, if you specify to upgrade daemons of type osd on host
Host1 with ``limit`` set to 3, cephadm will upgrade (up to) 3 osd daemons on
Host1.
Example: specifying daemon types and hosts:
.. prompt:: bash #
ceph orch upgrade start --image <image-name> --daemon-types mgr,mon --hosts host1,host2
Example: specifying services and using limit:
.. prompt:: bash #
ceph orch upgrade start --image <image-name> --services rgw.example1,rgw.example2 --limit 2
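The progress of a staggered upgrade can be watched in the same way as a full
upgrade, for example:

.. prompt:: bash #

   ceph orch upgrade status
   ceph -W cephadm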
.. note::
Cephadm strictly enforces an order to the upgrade of daemons that is still present
in staggered upgrade scenarios. The current upgrade ordering is
``mgr -> mon -> crash -> osd -> mds -> rgw -> rbd-mirror -> cephfs-mirror -> iscsi -> nfs``.
If you specify parameters that would upgrade daemons out of order, the upgrade
command will block and note which daemons will be missed if you proceed.
.. note::
Upgrade commands with limiting parameters will validate the options before beginning the
upgrade, which may require pulling the new container image. Do not be surprised
if the upgrade start command takes a while to return when limiting parameters are provided.
.. note::
In staggered upgrade scenarios (when a limiting parameter is provided) monitoring
stack daemons including Prometheus and node-exporter are refreshed after the Manager
daemons have been upgraded. Do not be surprised if Manager upgrades thus take longer
than expected. Note that the versions of monitoring stack daemons may not change between
Ceph releases, in which case they are only redeployed.
Upgrading to a version that supports staggered upgrade from one that doesn't
----------------------------------------------------------------------------
When upgrading from a version that already supports staggered upgrades, the process
simply requires providing the necessary arguments. However, if you wish to upgrade
to a version that supports staggered upgrades from one that does not, there is a
workaround: first manually upgrade the Manager daemons, and then pass
the limiting parameters as usual.
.. warning::
Make sure you have multiple running mgr daemons before attempting this procedure.
To start with, determine which Manager is your active one and which are standby. This
can be done in a variety of ways such as looking at the ``ceph -s`` output. Then,
manually upgrade each standby mgr daemon with:
.. prompt:: bash #
ceph orch daemon redeploy mgr.example1.abcdef --image <new-image-name>
.. note::
If you are on a very early version of cephadm (early Octopus) the ``orch daemon redeploy``
command may not have the ``--image`` flag. In that case, you must manually set the
Manager container image (``ceph config set mgr container_image <new-image-name>``) and then
redeploy the Manager (``ceph orch daemon redeploy mgr.example1.abcdef``).
At this point, a Manager failover should make the active Manager one that is
running the new version.
.. prompt:: bash #
ceph mgr fail
Verify that the active Manager is now running the new version. To complete the
upgrade of the Manager daemons:
.. prompt:: bash #
ceph orch upgrade start --image <new-image-name> --daemon-types mgr
You should now have all your Manager daemons on the new version and be able to
specify the limiting parameters for the rest of the upgrade.

View File

@ -1,3 +1,12 @@
.. _cephfs_add_remote_mds:
.. note::
It is highly recommended to use :doc:`/cephadm/index` or another Ceph
orchestrator for setting up the Ceph cluster. Use this approach only if you
are setting up the Ceph cluster manually. If you still intend to deploy
MDS daemons manually, :doc:`/cephadm/services/mds/` can
also be used.
============================
Deploying Metadata Servers
============================
@ -62,7 +71,7 @@ means limiting its cache size.
Adding an MDS
=============
#. Create an mds data point ``/var/lib/ceph/mds/ceph-${id}``. The daemon only uses this directory to store its keyring.
#. Create an mds directory ``/var/lib/ceph/mds/ceph-${id}``. The daemon only uses this directory to store its keyring.
#. Create the authentication key, if you use CephX: ::

View File

@ -1,3 +1,5 @@
.. _ceph-dokan:
=======================
Mount CephFS on Windows
=======================

View File

@ -64,7 +64,7 @@ Copy a file/directory to Ceph File System from Local File System.
Usage :
put [options] <source_path> [target_path]
put [options] <source_path> <target_path>
* source_path - local file/directory path to be copied to cephfs.
* if `.` copies all the file/directories in the local working directory.
@ -84,7 +84,7 @@ Copy a file from Ceph File System to Local File System.
Usage :
get [options] <source_path> [target_path]
get [options] <source_path> <target_path>
* source_path - remote file/directory path which is to be copied to local file system.
* if `.` copies all the file/directories in the remote working directory.

Binary file not shown.


View File

@ -1,3 +1,5 @@
.. _cephfs-top:
==================
CephFS Top Utility
==================
@ -7,7 +9,7 @@ in realtime. `cephfs-top` is a curses based python script which makes use of `st
plugin in Ceph Manager to fetch (and display) metrics.
Manager Plugin
--------------
==============
Ceph Filesystem clients periodically forward various metrics to Ceph Metadata Servers (MDS)
which in turn get forwarded to Ceph Manager by MDS rank zero. Each active MDS forward its
@ -27,9 +29,8 @@ metrics are for a particular MDS rank (e.g., number of subtrees handled by an MD
Once enabled, Ceph Filesystem metrics can be fetched via::
$ ceph fs perf stats
{"version": 1, "global_counters": ["cap_hit", "read_latency", "write_latency", "metadata_latency", "dentry_lease"], "counters": [], "client_metadata": {"client.614146": {"IP": "10.1.1.100", "hostname" : "ceph-host1", "root": "/", "mount_point": "/mnt/cephfs", "valid_metrics": ["cap_hit", "read_latency", "write_latency", "metadata_latency", "dentry_lease"]}}, "global_metrics": {"client.614146": [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]}, "metrics": {"delayed_ranks": [], "mds.0": {"client.614146": []}}}
Details of the JSON command output are as follows:
The output format is JSON and contains fields as follows:
- `version`: Version of stats output
- `global_counters`: List of global performance metrics
@ -52,7 +53,7 @@ To fetch metrics only for a subset of active MDSs (e.g., MDS rank 1 and 2)::
$ ceph fs perf stats --mds_rank=1,2
`cephfs-top`
------------
============
`cephfs-top` utility relies on `stats` plugin to fetch performance metrics and display in
`top(1)` like format. `cephfs-top` is available as part of `cephfs-top` package.
@ -62,6 +63,9 @@ By default, `cephfs-top` uses `client.fstop` user to connect to a Ceph cluster::
$ ceph auth get-or-create client.fstop mon 'allow r' mds 'allow r' osd 'allow r' mgr 'allow r'
$ cephfs-top
Command-Line Options
--------------------
To use a non-default user (other than `client.fstop`) use::
$ cephfs-top --id <name>
@ -76,8 +80,18 @@ By default, `cephfs-top` connects to cluster name `ceph`. To use a non-default c
Interval should be greater or equal to 0.5 second. Fractional seconds are honoured.
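Putting these options together, a possible invocation (the `stats` module must be
enabled first; the two-second refresh interval and the default `client.fstop` user
are only an example) would be::

    $ ceph mgr module enable stats
    $ cephfs-top -d 2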
Sample screenshot running `cephfs-top` with 2 clients:
Interactive Commands
--------------------
1. m : Filesystem selection
Displays a menu of filesystems for selection.
2. q : Quit
Exit the utility if you are at the home screen (All Filesystem Info),
otherwise escape back to the home screen.
The metrics display can be scrolled using the Arrow Keys, PgUp/PgDn, Home/End and mouse.
Sample screenshot running `cephfs-top` with 2 filesystems:
.. image:: cephfs-top.png
.. note:: As of now, `cephfs-top` does not reliably work with multiple Ceph Filesystems.

View File

@ -80,18 +80,63 @@ List volumes using::
$ ceph fs volume ls
Fetch the information of a CephFS volume using::
$ ceph fs volume info vol_name [--human_readable]
The ``--human_readable`` flag shows used and available pool capacities in KB/MB/GB.
The output format is JSON and contains fields as follows:
* pools: Attributes of data and metadata pools
* avail: The amount of free space available in bytes
* used: The amount of storage consumed in bytes
* name: Name of the pool
* mon_addrs: List of monitor addresses
* used_size: Current used size of the CephFS volume in bytes
* pending_subvolume_deletions: Number of subvolumes pending deletion
Sample output of volume info command::
$ ceph fs volume info vol_name
{
"mon_addrs": [
"192.168.1.7:40977"
],
"pending_subvolume_deletions": 0,
"pools": {
"data": [
{
"avail": 106288709632,
"name": "cephfs.vol_name.data",
"used": 4096
}
],
"metadata": [
{
"avail": 106288709632,
"name": "cephfs.vol_name.meta",
"used": 155648
}
]
},
"used_size": 0
}
FS Subvolume groups
-------------------
Create a subvolume group using::
$ ceph fs subvolumegroup create <vol_name> <group_name> [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>]
$ ceph fs subvolumegroup create <vol_name> <group_name> [--size <size_in_bytes>] [--pool_layout <data_pool_name>] [--uid <uid>] [--gid <gid>] [--mode <octal_mode>]
The command succeeds even if the subvolume group already exists.
When creating a subvolume group you can specify its data pool layout (see
:doc:`/cephfs/file-layouts`), uid, gid, and file mode in octal numerals. By default, the
subvolume group is created with an octal file mode '755', uid '0', gid '0' and data pool
:doc:`/cephfs/file-layouts`), uid, gid, file mode in octal numerals and
size in bytes. The size of the subvolume group is specified by setting
a quota on it (see :doc:`/cephfs/quota`). By default, the subvolume group
is created with an octal file mode '755', uid '0', gid '0' and the data pool
layout of its parent directory.
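For instance (volume and group names are placeholders), a subvolume group with a
10 GiB quota and mode '755' could be created with::

    $ ceph fs subvolumegroup create cephfs mygroup --size 10737418240 --mode 755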
@ -114,6 +159,47 @@ List subvolume groups using::
.. note:: Subvolume group snapshot feature is no longer supported in mainline CephFS (existing group
snapshots can still be listed and deleted)
Fetch the metadata of a subvolume group using::
$ ceph fs subvolumegroup info <vol_name> <group_name>
The output format is JSON and contains fields as follows.
* atime: access time of subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* mtime: modification time of subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* ctime: change time of subvolume group path in the format "YYYY-MM-DD HH:MM:SS"
* uid: uid of subvolume group path
* gid: gid of subvolume group path
* mode: mode of subvolume group path
* mon_addrs: list of monitor addresses
* bytes_pcent: quota used in percentage if quota is set, else displays "undefined"
* bytes_quota: quota size in bytes if quota is set, else displays "infinite"
* bytes_used: current used size of the subvolume group in bytes
* created_at: time of creation of subvolume group in the format "YYYY-MM-DD HH:MM:SS"
* data_pool: data pool the subvolume group belongs to
Check the presence of any subvolume group using::
$ ceph fs subvolumegroup exist <vol_name>
The strings returned by the 'exist' command:
* "subvolumegroup exists": if any subvolumegroup is present
* "no subvolumegroup exists": if no subvolumegroup is present
.. note:: It checks for the presence of custom groups and not the default one. To validate the emptiness of the volume, subvolumegroup existence check alone is not sufficient. The subvolume existence also needs to be checked as there might be subvolumes in the default group.
Resize a subvolume group using::
$ ceph fs subvolumegroup resize <vol_name> <group_name> <new_size> [--no_shrink]
The command resizes the subvolume group quota using the size specified by 'new_size'.
The '--no_shrink' flag prevents the subvolume group from shrinking below the current used
size of the subvolume group.
The subvolume group can be resized to an unlimited size by passing 'inf' or 'infinite'
as the new_size.
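For example (names are placeholders), the following grows the group to 20 GiB while
refusing to shrink it, and then removes the size limit entirely::

    $ ceph fs subvolumegroup resize cephfs mygroup 21474836480 --no_shrink
    $ ceph fs subvolumegroup resize cephfs mygroup inf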
Remove a snapshot of a subvolume group using::
$ ceph fs subvolumegroup snapshot rm <vol_name> <group_name> <snap_name> [--force]
@ -195,7 +281,7 @@ Fetch the absolute path of a subvolume using::
$ ceph fs subvolume getpath <vol_name> <subvol_name> [--group_name <subvol_group_name>]
Fetch the metadata of a subvolume using::
Fetch the information of a subvolume using::
$ ceph fs subvolume info <vol_name> <subvol_name> [--group_name <subvol_group_name>]
@ -243,6 +329,40 @@ List subvolumes using::
.. note:: subvolumes that are removed but have snapshots retained, are also listed.
Check the presence of any subvolume using::
$ ceph fs subvolume exist <vol_name> [--group_name <subvol_group_name>]
The strings returned by the 'exist' command:
* "subvolume exists": if any subvolume of given group_name is present
* "no subvolume exists": if no subvolume of given group_name is present
Set custom metadata on the subvolume as a key-value pair using::
$ ceph fs subvolume metadata set <vol_name> <subvol_name> <key_name> <value> [--group_name <subvol_group_name>]
.. note:: If the key_name already exists then the old value will get replaced by the new value.
.. note:: key_name and value should be a string of ASCII characters (as specified in python's string.printable). key_name is case-insensitive and always stored in lower case.
.. note:: Custom metadata on a subvolume is not preserved when snapshotting the subvolume, and hence, is also not preserved when cloning the subvolume snapshot.
Get custom metadata set on the subvolume using the metadata key::
$ ceph fs subvolume metadata get <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>]
List custom metadata (key-value pairs) set on the subvolume using::
$ ceph fs subvolume metadata ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
Remove custom metadata set on the subvolume using the metadata key::
$ ceph fs subvolume metadata rm <vol_name> <subvol_name> <key_name> [--group_name <subvol_group_name>] [--force]
Using the '--force' flag allows the command to succeed; without it, the command
fails if the metadata key does not exist.
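A short walk-through (subvolume, key and value names are illustrative, and the
output is shown approximately)::

    $ ceph fs subvolume metadata set cephfs subvol1 owner alice
    $ ceph fs subvolume metadata get cephfs subvol1 owner
    alice
    $ ceph fs subvolume metadata ls cephfs subvol1
    {"owner": "alice"}
    $ ceph fs subvolume metadata rm cephfs subvol1 owner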
Create a snapshot of a subvolume using::
$ ceph fs subvolume snapshot create <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
@ -261,16 +381,73 @@ List snapshots of a subvolume using::
$ ceph fs subvolume snapshot ls <vol_name> <subvol_name> [--group_name <subvol_group_name>]
Fetch the metadata of a snapshot using::
Fetch the information of a snapshot using::
$ ceph fs subvolume snapshot info <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
The output format is json and contains fields as follows.
The output format is JSON and contains fields as follows.
* created_at: time of creation of snapshot in the format "YYYY-MM-DD HH:MM:SS:ffffff"
* data_pool: data pool the snapshot belongs to
* has_pending_clones: "yes" if snapshot clone is in progress otherwise "no"
* size: snapshot size in bytes
* pending_clones: list of in-progress or pending clones and their target groups, if any exist; otherwise this field is not shown
* orphan_clones_count: count of orphan clones, if the snapshot has orphan clones; otherwise this field is not shown
Sample output when snapshot clones are in progress or pending state::
$ ceph fs subvolume snapshot info cephfs subvol snap
{
"created_at": "2022-06-14 13:54:58.618769",
"data_pool": "cephfs.cephfs.data",
"has_pending_clones": "yes",
"pending_clones": [
{
"name": "clone_1",
"target_group": "target_subvol_group"
},
{
"name": "clone_2"
},
{
"name": "clone_3",
"target_group": "target_subvol_group"
}
]
}
Sample output when no snapshot clone is in progress or pending state::
$ ceph fs subvolume snapshot info cephfs subvol snap
{
"created_at": "2022-06-14 13:54:58.618769",
"data_pool": "cephfs.cephfs.data",
"has_pending_clones": "no"
}
Set custom metadata on the snapshot as a key-value pair using::
$ ceph fs subvolume snapshot metadata set <vol_name> <subvol_name> <snap_name> <key_name> <value> [--group_name <subvol_group_name>]
.. note:: If the key_name already exists then the old value will get replaced by the new value.
.. note:: The key_name and value should be a string of ASCII characters (as specified in python's string.printable). The key_name is case-insensitive and always stored in lower case.
.. note:: Custom metadata on a snapshot is not preserved when snapshotting the subvolume, and hence is also not preserved when cloning the subvolume snapshot.
Get custom metadata set on the snapshot using the metadata key::
$ ceph fs subvolume snapshot metadata get <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>]
List custom metadata (key-value pairs) set on the snapshot using::
$ ceph fs subvolume snapshot metadata ls <vol_name> <subvol_name> <snap_name> [--group_name <subvol_group_name>]
Remove custom metadata set on the snapshot using the metadata key::
$ ceph fs subvolume snapshot metadata rm <vol_name> <subvol_name> <snap_name> <key_name> [--group_name <subvol_group_name>] [--force]
Using the '--force' flag allows the command to succeed; without it, the command
fails if the metadata key does not exist.
Cloning Snapshots
-----------------
@ -323,8 +500,14 @@ A clone can be in one of the following states:
#. `in-progress` : Clone operation is in progress
#. `complete` : Clone operation has successfully finished
#. `failed` : Clone operation has failed
#. `canceled` : Clone operation is cancelled by user
Sample output from an `in-progress` clone operation::
When a clone fails, the reason for the failure is shown in the following fields:
#. `errno` : error number
#. `error_msg` : failure error string
Sample output of an `in-progress` clone operation::
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
$ ceph fs clone status cephfs clone1
@ -339,6 +522,28 @@ Sample output from an `in-progress` clone operation::
}
}
.. note:: The `failure` section will be shown only if the clone is in failed or cancelled state
Sample output of a `failed` clone operation::
$ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1
$ ceph fs clone status cephfs clone1
{
"status": {
"state": "failed",
"source": {
"volume": "cephfs",
"subvolume": "subvol1",
"snapshot": "snap1"
"size": "104857600"
},
"failure": {
"errno": "122",
"errstr": "Disk quota exceeded"
}
}
}
(Note: since `subvol1` is in the default group, the `source` section in `clone status` does not include the group name.)
.. note:: Cloned subvolumes are accessible only after the clone operation has successfully completed.

View File

@ -57,6 +57,7 @@ D0 -> N3 [color=red,penwidth=2.0];
// terminal (but not "in")
node [shape=polygon,sides=6,color=black,peripheries=1];
D1 [label="down:damaged"]
S2 -> D1 [color=black,penwidth=2.0];
N3 -> D1 [color=black,penwidth=2.0];
N4 -> D1 [color=black,penwidth=2.0];
N5 -> D1 [color=black,penwidth=2.0];
@ -69,5 +70,6 @@ D1 -> D0 [color=red,penwidth=2.0]
node [shape=polygon,sides=6,color=purple,peripheries=1];
D3 [label="down:stopped"]
S3 -> D3 [color=purple,penwidth=2.0];
N6 -> D3 [color=purple,penwidth=2.0];
}

View File

@ -31,7 +31,7 @@ POSIX semantics for various reasons:
writes are not coherently propagated to other clients' caches. That
is, if a page is cached on host A, and then updated on host B, host
A's page is not coherently invalidated. (Shared writable mmap
appears to be quite rare--we have yet to here any complaints about this
appears to be quite rare--we have yet to hear any complaints about this
behavior, and implementing cache coherency properly is complex.)
- CephFS clients present a hidden ``.snap`` directory that is used to
access, create, delete, and rename snapshots. Although the virtual
@ -62,17 +62,15 @@ as client data may not even be flushed to the server until the file is
closed (and more generally writes will be significantly more
time-shifted than CephFS, leading to less predictable results).
However, all of there are very close to POSIX, and most of the time
applications don't notice too much. Many other storage systems (e.g.,
HDFS) claim to be "POSIX-like" but diverge significantly from the
standard by dropping support for things like in-place file
modifications, truncate, or directory renames.
Regardless, these are all similar enough to POSIX, and applications still work
most of the time. Many other storage systems (e.g., HDFS) claim to be
"POSIX-like" but diverge significantly from the standard by dropping support
for things like in-place file modifications, truncate, or directory renames.
Bottom line
-----------
CephFS relaxes more than local Linux kernel file systems (e.g., writes
CephFS relaxes more than local Linux kernel file systems (for example, writes
spanning object boundaries may be torn). It relaxes strictly less
than NFS when it comes to multiclient consistency, and generally less
than NFS when it comes to write atomicity.

View File

@ -41,6 +41,17 @@ Limitations
the directory the client is restricted to (e.g., ``/home/user``)
or something nested beneath it.
Kernel clients need access to the parent of the directory inode on
which quotas are configured in order to enforce them. If quota is
configured on a directory path (e.g., ``/home/volumes/group``), the
kclient needs to have access to the parent (e.g., ``/home/volumes``).
An example command to create such a user is shown below::
$ ceph auth get-or-create client.guest mds 'allow r path=/home/volumes, allow rw path=/home/volumes/group' mgr 'allow rw' osd 'allow rw tag cephfs metadata=*' mon 'allow r'
See also: https://tracker.ceph.com/issues/55090
#. *Snapshot file data which has since been deleted or changed does not count
towards the quota.* See also: http://tracker.ceph.com/issues/24284

View File

@ -1,3 +1,5 @@
.. _snap-schedule:
==========================
Snapshot Scheduling Module
==========================
@ -47,10 +49,9 @@ The following time periods are recognized: `h(our), d(ay), w(eek), m(onth),
y(ear)` and `n`. The latter is a special modifier where e.g. `10n` means keep
the last 10 snapshots regardless of timing,
All subcommands take optional `fs` and `subvol` arguments to specify paths in
All subcommands take an optional `fs` argument to specify paths in
multi-fs setups and :doc:`/cephfs/fs-volumes` managed setups. If not
passed `fs` defaults to the first file system listed in the fs_map, `subvolume`
defaults to nothing.
passed, `fs` defaults to the first file system listed in the fs_map.
When using :doc:`/cephfs/fs-volumes` the argument `fs` is equivalent to a
`volume`.
@ -64,16 +65,21 @@ When no subcommand is supplied a synopsis is printed::
#> ceph fs snap-schedule
no valid command found; 8 closest matches:
fs snap-schedule status [<path>] [<subvol>] [<fs>] [<format>]
fs snap-schedule list <path> [<subvol>] [--recursive] [<fs>] [<format>]
fs snap-schedule add <path> <snap_schedule> [<start>] [<fs>] [<subvol>]
fs snap-schedule remove <path> [<repeat>] [<start>] [<subvol>] [<fs>]
fs snap-schedule retention add <path> <retention_spec_or_period> [<retention_count>] [<fs>] [<subvol>]
fs snap-schedule retention remove <path> <retention_spec_or_period> [<retention_count>] [<fs>] [<subvol>]
fs snap-schedule activate <path> [<repeat>] [<start>] [<subvol>] [<fs>]
fs snap-schedule deactivate <path> [<repeat>] [<start>] [<subvol>] [<fs>]
fs snap-schedule status [<path>] [<fs>] [<format>]
fs snap-schedule list <path> [--recursive] [<fs>] [<format>]
fs snap-schedule add <path> <snap_schedule> [<start>] [<fs>]
fs snap-schedule remove <path> [<repeat>] [<start>] [<fs>]
fs snap-schedule retention add <path> <retention_spec_or_period> [<retention_count>] [<fs>]
fs snap-schedule retention remove <path> <retention_spec_or_period> [<retention_count>] [<fs>]
fs snap-schedule activate <path> [<repeat>] [<start>] [<fs>]
fs snap-schedule deactivate <path> [<repeat>] [<start>] [<fs>]
Error EINVAL: invalid command
Note:
^^^^^
A `subvolume` argument is no longer accepted by the commands.
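For example (the path is a placeholder; since no `fs` argument is given, the first
file system in the fs_map is used), an hourly schedule retaining 24 hourly
snapshots could be configured with::

    #> ceph fs snap-schedule add /some/dir 1h
    #> ceph fs snap-schedule retention add /some/dir h 24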
Inspect snapshot schedules
--------------------------

View File

@ -109,6 +109,9 @@ extensions = [
ditaa = shutil.which("ditaa")
if ditaa is not None:
# in case we don't have binfmt_misc enabled or jar is not registered
ditaa_args = ['-jar', ditaa]
ditaa = 'java'
extensions += ['sphinxcontrib.ditaa']
else:
extensions += ['plantweb.directive']

View File

@ -32,8 +32,8 @@ The ceph orch command will be extended to support maintenance.
.. code-block::
ceph orch host enter-maintenance <host> [ --check ]
ceph orch host exit-maintenance <host>
ceph orch host maintenance enter <host> [ --force ]
ceph orch host maintenance exit <host>
.. note:: In addition, the host's status should be updated to reflect whether it
is in maintenance or not.

View File

@ -0,0 +1,285 @@
Continuous Integration Architecture
===================================
In Ceph, we rely on multiple CI pipelines in our development. Most of these pipelines
are centered around Jenkins, and their configurations are generated using `Jenkins Job Builder`_.
.. _Jenkins Job Builder: https://docs.openstack.org/infra/jenkins-job-builder/
Let's take the ``make check`` performed by Jenkins as an example.
ceph-pull-requests
------------------
``ceph-pull-requests`` is a Jenkins job that is triggered by a GitHub pull
request or by a trigger phrase like::
jenkins test make check
There are multiple parties involved in this Jenkins job:
.. graphviz::
digraph {
rankdir="LR";
github [
label="<git> git_repo | <webhooks> webhooks | <api> api";
shape=record;
href="https://github.com/ceph/ceph";
];
subgraph cluster_lab {
label="Sepia Lab";
href="https://wiki.sepia.ceph.com/doku.php";
shape=circle;
apt_mirror [
href="http://apt-mirror.front.sepia.ceph.com";
];
shaman [
href="https://shaman.ceph.com";
];
chacra [
peripheries=3;
href="https://chacra.ceph.com";
];
subgraph cluster_jenkins {
label="jenkins";
href="https://jenkins.ceph.com";
jenkins_controller [ label = "controller" ];
jenkins_agents [ label = "agents", peripheries=3 ];
};
};
{
rank=same;
package_repos [ peripheries=3 ];
pypi;
npm;
}
github:webhooks -> jenkins_controller [ label = "notify", color = "crimson" ];
jenkins_controller -> jenkins_agents [ label = "schedule jobs" ];
jenkins_agents -> github:git [ label = "git pull" ];
jenkins_agents -> shaman [ label = "query for chacra repo URL" ];
jenkins_agents -> chacra [ label = "pull build dependencies" ];
jenkins_agents -> package_repos [ label = "pull build dependencies" ];
jenkins_agents -> pypi [ label = "pull Python packages" ];
jenkins_agents -> npm [ label = "pull JavaScript packages" ];
jenkins_agents -> apt_mirror [ label = "pull build dependencies" ];
jenkins_agents -> github:api [ label = "update", color = "crimson" ];
}
Where
Sepia Lab
`Sepia Lab`_ is a test lab used by the Ceph project. This lab offers
the storage and computing resources required by our CI infra.
Jenkins agents
are a set of machines which perform the CI jobs. In this case, they
#. pull the git repo from GitHub and
#. rebase the pull request against the latest master
#. set necessary environment variables
#. run ``run-make-check.sh``
Chacra
is a server offering a RESTful API that allows clients to store and
retrieve binary packages. It also creates the repo for uploaded
packages automatically. Once a certain repo is created on chacra, the
configured shaman server is updated as well, then we can query shaman
for the corresponding repo address. Chacra not only hosts Ceph packages,
it also hosts quite a few other packages like various build dependencies.
Shaman
is a server offering a RESTful API that allows clients to query the
information of repos hosted by chacra nodes. Shaman is also known
for its `Web UI`_. But please note, shaman does not build the
packages, it just offers information on the builds.
As the following shows, `chacra`_ manages multiple projects whose metadata
are stored in a database. These metadata are exposed via Shaman as a web
service. `chacractl`_ is a utility to interact with the `chacra`_ service.
.. graphviz::
digraph {
libboost [
shape=cylinder;
];
libzbd [
shape=cylinder;
];
other_repos [
label="...";
shape=cylinder;
];
postgresql [
shape=cylinder;
style=filled;
]
shaman -> postgresql;
chacra -> postgresql;
chacractl -> chacra;
chacra -> libboost;
chacra -> libzbd;
chacra -> other_repos;
}
.. _Sepia Lab: https://wiki.sepia.ceph.com/doku.php
.. _Web UI: https://shaman.ceph.com
build dependencies
------------------
Just like lots of other software projects, Ceph has both build-time and
run-time dependencies. Most of the time, we are inclined to use the packages
prebuilt by the distro. But there are cases where
- the necessary dependencies are either missing in the distro, or
- their versions are too old, or
- they are packaged without some important feature enabled.
- we want to ensure that the version of a certain runtime dependency is
identical to the one we tested in our lab.
No matter what the reason is, we either need to build them from source, or
to package them as binary packages instead of using the ones shipped by the
distro. Quite a few build-time dependencies are included as git submodules,
but in order to avoid rebuilding these dependencies repeatedly, we pre-built
some of them and uploaded them to our own repos. So, when performing
``make check``, the building hosts in our CI just pull them from our internal
repos hosting these packages instead of building them.
So far, the following packages are prebuilt for Ubuntu Focal and then uploaded to
`chacra`_:
libboost
packages `boost`_. The packages' names are changed from ``libboost-*`` to
``ceph-libboost-*``, and they are instead installed into ``/opt/ceph``, so
they don't interfere with the official ``libboost`` packages shipped by
distro. Its build scripts are hosted at https://github.com/ceph/ceph-boost.
See https://github.com/ceph/ceph-boost/commit/2a8ae02932b2a1fd6a68072da8ca0df2b99b805c
for an example of how to bump the version number. The commands used to
build 1.79 on a vanilla Ubuntu Focal OS are below.
.. prompt:: bash $
sudo apt install debhelper dctrl-tools chrpath libbz2-dev libicu-dev bison \
flex docbook-to-man help2man xsltproc doxygen dh-python python3-all-dev graphviz
wget http://download.ceph.com/qa/boost_1_79_0.tar.bz2
git clone https://github.com/ceph/ceph-boost
tar xjf boost_1_79_0.tar.bz2
cp -ra ceph-boost/debian boost_1_79_0/
pushd boost_1_79_0
export DEB_BUILD_OPTIONS='parallel=6 nodoc'
dpkg-buildpackage -us -uc -b
popd
BOOST_SHA=$(git ls-remote https://github.com/ceph/ceph-boost main | awk '{ print $1 }')
ls *.deb | chacractl binary create \
libboost/master/$BOOST_SHA/ubuntu/focal/amd64/flavors/default
libzbd
packages `libzbd`_ . The upstream libzbd includes debian packaging already.
libpmem
packages `pmdk`_ . Please note, ``ndctl`` is one of the build dependencies of
pmdk, for an updated debian packaging, please see
https://github.com/ceph/ceph-ndctl .
.. note::
please ensure that the package version and the release number of the
packaging are properly updated when updating/upgrading the packaging,
otherwise it would be difficult to tell which version of the package
is installed. We check the package version before trying to upgrade
it in ``install-deps.sh``.
.. _boost: https://www.boost.org
.. _libzbd: https://github.com/westerndigitalcorporation/libzbd
.. _pmdk: https://github.com/pmem/pmdk
But in addition to these libraries, ``ceph-mgr-dashboard``'s frontend uses lots of
JavaScript packages. Quite a few of them are not packaged by distros. Not to
mention the trouble of testing different combinations of versions of these
packages. So we decided to include these JavaScript packages in our dist tarball
using ``make-dist``.
Also, because our downstream might not want to use the prepackaged binaries when
redistributing the precompiled Ceph packages, we also need to include these
libraries in our dist tarball. They are
- boost
- liburing
- pmdk
``make-dist`` is a script used by our CI pipeline to create dist tarball so the
tarball can be used to build the Ceph packages in a clean room environment. When
we need to upgrade these third party libraries, we should
- update the CMake script
- rebuild the prebuilt packages and
- update this script to reflect the change.
Uploading Dependencies
----------------------
To ensure that prebuilt packages are available to the Jenkins agents, we need to
upload them to either ``apt-mirror.front.sepia.ceph.com`` or `chacra`_. Uploading
packages to the former requires the help of our lab administrator, so if we
want to maintain the package repositories on a regular basis, a better choice is
to manage them using `chacractl`_. `chacra`_ represents package repositories using
a resource hierarchy, like::
<project>/<branch>/<ref>/<distro>/<distro-version>/<arch>
In which:
project
in general, it is used for denoting a set of related packages. For instance,
``libboost``.
branch
branch of project. This mirrors the concept of a Git repo.
ref
a unique id of a given version of a set of packages. This id is used to reference
the set of packages under the ``<project>/<branch>``. It is a good practice to
version the packaging recipes, like the ``debian`` directory for building DEB
packages and the ``spec`` for building RPM packages, and use the SHA1 of the
packaging recipe for the ``ref``. But you could also use a random string for
``ref``, like the tag name of the built source tree.
distro
the distro name for which the packages are built. Currently, the following distros are
supported:
- centos
- debian
- fedora
- rhel
- ubuntu
distro-version
the version of the distro. For instance, if a package is built on ubuntu focal,
the ``distro-version`` should be ``20.04``.
arch
the architecture of the packages. It could be:
- arm64
- amd64
- noarch
So, for example, we can upload the prebuilt boost packages to chacra like
.. prompt:: bash $
ls *.deb | chacractl binary create \
libboost/master/099c0fd56b4a54457e288a2eff8fffdc0d416f7a/ubuntu/focal/amd64/flavors/default
.. _chacra: https://github.com/ceph/chacra
.. _chacractl: https://github.com/ceph/chacractl
Update ``install-deps.sh``
--------------------------
We also need to update ``install-deps.sh`` to point the build script to the new
repo. Please refer to the `script <https://github.com/ceph/ceph/blob/master/install-deps.sh>`_
for more details.

View File

@ -1,151 +0,0 @@
===============
Deduplication
===============
Introduction
============
Applying data deduplication on an existing software stack is not easy
due to additional metadata management and original data processing
procedure.
In a typical deduplication system, the input source as a data
object is split into multiple chunks by a chunking algorithm.
The deduplication system then compares each chunk with
the existing data chunks, stored in the storage previously.
To this end, a fingerprint index that stores the hash value
of each chunk is employed by the deduplication system
in order to easily find the existing chunks by comparing
hash value rather than searching all contents that reside in
the underlying storage.
There are many challenges in order to implement deduplication on top
of Ceph. Among them, two issues are essential for deduplication.
First is managing scalability of fingerprint index; Second is
it is complex to ensure compatibility between newly applied
deduplication metadata and existing metadata.
Key Idea
========
1. Content hashing (Double hashing): Each client can find an object data
for an object ID using CRUSH. With CRUSH, a client knows object's location
in Base tier.
By hashing object's content at Base tier, a new OID (chunk ID) is generated.
Chunk tier stores in the new OID that has a partial content of original object.
Client 1 -> OID=1 -> HASH(1's content)=K -> OID=K ->
CRUSH(K) -> chunk's location
2. Self-contained object: The external metadata design
makes difficult for integration with storage feature support
since existing storage features cannot recognize the
additional external data structures. If we can design data
deduplication system without any external component, the
original storage features can be reused.
More details in https://ieeexplore.ieee.org/document/8416369
Design
======
.. ditaa::
+-------------+
| Ceph Client |
+------+------+
^
Tiering is |
Transparent | Metadata
to Ceph | +---------------+
Client Ops | | |
| +----->+ Base Pool |
| | | |
| | +-----+---+-----+
| | | ^
v v | | Dedup metadata in Base Pool
+------+----+--+ | | (Dedup metadata contains chunk offsets
| Objecter | | | and fingerprints)
+-----------+--+ | |
^ | | Data in Chunk Pool
| v |
| +-----+---+-----+
| | |
+----->| Chunk Pool |
| |
+---------------+
Data
Pool-based object management:
We define two pools.
The metadata pool stores metadata objects and the chunk pool stores
chunk objects. Since these two pools are divided based on
the purpose and usage, each pool can be managed more
efficiently according to its different characteristics. Base
pool and the chunk pool can separately select a redundancy
scheme between replication and erasure coding depending on
its usage and each pool can be placed in a different storage
location depending on the required performance.
Regarding how to use, please see ``osd_internals/manifest.rst``
Usage Patterns
==============
The different Ceph interface layers present potentially different oportunities
and costs for deduplication and tiering in general.
RadosGW
-------
S3 big data workloads seem like a good opportunity for deduplication. These
objects tend to be write once, read mostly objects which don't see partial
overwrites. As such, it makes sense to fingerprint and dedup up front.
Unlike cephfs and rbd, radosgw has a system for storing
explicit metadata in the head object of a logical s3 object for
locating the remaining pieces. As such, radosgw could use the
refcounting machinery (``osd_internals/refcount.rst``) directly without
needing direct support from rados for manifests.
RBD/Cephfs
----------
RBD and CephFS both use deterministic naming schemes to partition
block devices/file data over rados objects. As such, the redirection
metadata would need to be included as part of rados, presumably
transparently.
Moreover, unlike radosgw, rbd/cephfs rados objects can see overwrites.
For those objects, we don't really want to perform dedup, and we don't
want to pay a write latency penalty in the hot path to do so anyway.
As such, performing tiering and dedup on cold objects in the background
is likely to be preferred.
One important wrinkle, however, is that both rbd and cephfs workloads
often feature usage of snapshots. This means that the rados manifest
support needs robust support for snapshots.
RADOS Machinery
===============
For more information on rados redirect/chunk/dedup support, see ``osd_internals/manifest.rst``.
For more information on rados refcount support, see ``osd_internals/refcount.rst``.
Status and Future Work
======================
At the moment, there exists some preliminary support for manifest
objects within the OSD as well as a dedup tool.
RadosGW data warehouse workloads probably represent the largest
opportunity for this feature, so the first priority is probably to add
direct support for fingerprinting and redirects into the refcount pool
to radosgw.
Aside from radosgw, completing work on manifest object support in the
OSD particularly as it relates to snapshots would be the next step for
rbd and cephfs workloads.

View File

@ -2,11 +2,12 @@
CephFS delayed deletion
=========================
When you delete a file, the data is not immediately removed. Each
object in the file needs to be removed independently, and sending
``size_of_file / stripe_size * replication_count`` messages would slow
the client down too much, and use too much of the client's
bandwidth. Additionally, snapshots may mean some objects should not be
deleted.
The deletion of a file does not immediately remove its data. Each of the file's
underlying objects must be removed independently. If these objects were removed
immediately, the client would have to send ``size_of_file / stripe_size *
replication_count`` messages. This would consume significant bandwidth and would
slow the client unacceptably. If snapshots exist, their existence can prevent
the deletion of objects associated with them.
Instead, the file is marked as deleted on the MDS, and deleted lazily.
In these cases, such files are (1) marked as deleted on the MDS and (2) deleted
lazily.
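To make the cost concrete, here is a worked instance of the
``size_of_file / stripe_size * replication_count`` estimate above; the file
size, stripe size, and replication factor are made-up values chosen only to
illustrate the arithmetic:

.. code-block:: python

   # Worked example of the message-count estimate quoted above.
   # The concrete numbers are assumptions chosen only for illustration.
   GiB = 1024 ** 3
   MiB = 1024 ** 2

   size_of_file = 1 * GiB       # a 1 GiB file
   stripe_size = 4 * MiB        # 4 MiB objects (assumed)
   replication_count = 3        # 3x replication (assumed)

   messages = size_of_file // stripe_size * replication_count
   print(messages)              # 256 objects * 3 replicas = 768 messages

Even for a modest 1 GiB file the estimate runs to hundreds of messages, which
is why the work is deferred and handled lazily rather than performed
synchronously by the client.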

View File

@ -1,3 +1,5 @@
.. _basic workflow dev guide:
Basic Workflow
==============
@ -8,7 +10,7 @@ The following chart illustrates the basic Ceph development workflow:
Upstream Code Your Local Environment
/----------\ git clone /-------------\
| Ceph | -------------------------> | ceph/master |
| Ceph | -------------------------> | ceph/main |
\----------/ \-------------/
^ |
| | git branch fix_1
@ -29,50 +31,79 @@ The following chart illustrates the basic Ceph development workflow:
\--------------/
This page assumes that you are a new contributor with an idea for a bugfix or
enhancement, but do not know how to proceed. Watch the `Getting Started with
Ceph Development <https://www.youtube.com/watch?v=t5UIehZ1oLs>`_ video for a
practical summary of this workflow.
an enhancement, but you do not know how to proceed. Watch the `Getting Started
with Ceph Development <https://www.youtube.com/watch?v=t5UIehZ1oLs>`_ video for
a practical summary of this workflow.
Updating the tracker
--------------------
Before you start, you should know the :ref:`issue-tracker` (Redmine) number
of the bug you intend to fix. If there is no tracker issue, now is the time to
create one for code changes. Straightforward documentation cleanup does
not necessarily require a corresponding tracker issue. However, an issue
(ticket) should be created if one is adding new documentation chapters or
files, or for other substantial changes.
Find the :ref:`issue-tracker` (Redmine) number of the bug you intend to fix. If
no tracker issue exists, create one. There is only one case in which you do not
have to create a Redmine tracker issue: the case of minor documentation changes.
The tracker ticket serves to explain the issue (bug) to your fellow Ceph
developers and keep them informed as you make progress toward resolution. To
this end, please provide a descriptive title and write appropriate information
and details into the description. When composing the ticket's title, consider "If I
want to search for this ticket two years from now, what keywords will I search
for?"
Simple documentation cleanup does not require a corresponding tracker issue.
Major documentation changes do require a tracker issue. Major documentation
changes include adding new documentation chapters or files, and making
substantial changes to the structure or content of the documentation.
If you have sufficient tracker permissions, assign the bug to yourself by
setting the ``Assignee`` field. If your tracker permissions have not been
elevated, simply add a comment with a short message like "I am working on this
issue".
A (Redmine) tracker ticket explains the issue (bug) to other Ceph developers to
keep them informed as the bug nears resolution. Provide a useful, clear title
and include detailed information in the description. When composing the title
of the ticket, ask yourself "If I need to search for this ticket two years from
now, which keywords am I likely to search for?" Then include those keywords in
the title.
Forking and Cloning the Ceph Repository
---------------------------------------
If your tracker permissions are elevated, assign the bug to yourself by setting
the ``Assignee`` field. If your tracker permissions have not been elevated,
just add a comment with a short message that says "I am working on this issue".
This section, and the ones that follow, correspond to nodes in the above chart.
Ceph Workflow Overview
----------------------
The upstream code is found at https://github.com/ceph/ceph.git, which is known
as the "upstream repo", or simply "upstream". As the chart shows, we will make
a local copy of this repository, modify it, test our modifications, then submit
the modifications for review and merging.
Three repositories are involved in the Ceph workflow. They are:
A local copy of the upstream code is made by
1. The upstream repository (ceph/ceph)
2. Your fork of the upstream repository (your_github_id/ceph)
3. Your local working copy of the repository (on your workstation)
1. Forking the upstream repo on GitHub, and
2. Cloning your fork to make a local working copy
The procedure for making changes to the Ceph repository is as follows:
#. Configure your local environment
Forking The Ceph Repository
^^^^^^^^^^^^^^^^^^^^^^^^^^^
#. :ref:`Create a fork<forking>` of the "upstream Ceph"
repository.
#. :ref:`Clone the fork<cloning>` to your local filesystem.
#. Fix the bug
#. :ref:`Synchronize local main with upstream main<synchronizing>`.
#. :ref:`Create a bugfix branch<bugfix_branch>` in your local working copy.
#. :ref:`Make alterations to the local working copy of the repository in your
local filesystem<fixing_bug_locally>`.
#. :ref:`Push the changes in your local working copy to your fork<push_changes>`.
#. Create a Pull Request to push the change upstream
#. Create a Pull Request that asks for your changes to be added into the
"upstream Ceph" repository.
Preparing Your Local Working Copy of the Ceph Repository
--------------------------------------------------------
The procedures in this section, "Preparing Your Local Working Copy of the Ceph
Repository", must be followed only when you are first setting up your local
environment. If this is your first time working with the Ceph project, then
these commands are necessary and are the first commands that you should run.
.. _forking:
Creating a Fork of the Ceph Repository
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
See the `GitHub documentation
<https://help.github.com/articles/fork-a-repo/#platform-linux>`_ for
@ -80,148 +111,234 @@ detailed instructions on forking. In short, if your GitHub username is
"mygithubaccount", your fork of the upstream repo will appear at
``https://github.com/mygithubaccount/ceph``.
.. _cloning:
Cloning Your Fork
^^^^^^^^^^^^^^^^^
Once you have created your fork, clone it by running:
After you have created your fork, clone it by running the following command:
.. prompt:: bash $
git clone https://github.com/mygithubaccount/ceph
You must fork the Ceph repository before you clone it. Without forking, you cannot
open a `GitHub pull request
You must fork the Ceph repository before you clone it. If you fail to fork,
you cannot open a `GitHub pull request
<https://docs.github.com/en/free-pro-team@latest/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request>`_.
For more information on using GitHub, refer to `GitHub Help
<https://help.github.com/>`_.
Configuring Your Local Environment
----------------------------------
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In the local environment created in the previous step, you now have a copy of
the ``master`` branch in ``remotes/origin/master``. This fork
(https://github.com/mygithubaccount/ceph.git) is frozen in time and the
upstream repo (https://github.com/ceph/ceph.git, typically abbreviated to
``ceph/ceph.git``) is updated frequently by other contributors. This means that
you must sync your fork periodically. Failure to synchronize your fork may
result in your commits and pull requests failing to merge because they refer to
file contents that have changed since you last synchronized your fork.
The commands in this section configure your local git environment so that it
generates "Signed-off-by:" tags. These commands also set up your local
environment so that it can stay synchronized with the upstream repository.
Configure your local git environment with your name and email address.
These commands are necessary only during the initial setup of your local
working copy; in other words, they are needed only the first time that you
work with the Ceph repository. They are, however, unavoidable: if you do not
run them, you will not be able to work on the Ceph repository.
.. prompt:: bash $
1. Configure your local git environment with your name and email address.
git config user.name "FIRST_NAME LAST_NAME"
git config user.email "MY_NAME@example.com"
.. note::
These commands will work only from within the ``ceph/`` directory
that was created when you cloned your fork.
Add the upstream repo as a "remote" and fetch it:
.. prompt:: bash $
.. prompt:: bash $
git config user.name "FIRST_NAME LAST_NAME"
git config user.email "MY_NAME@example.com"
git remote add ceph https://github.com/ceph/ceph.git
git fetch ceph
2. Add the upstream repo as a "remote" and fetch it:
Fetching is a process that downloads all objects (commits, branches) that have
been added since the last sync. These commands download all the branches from
``ceph/ceph.git`` to the local git repo as ``remotes/ceph/$BRANCH_NAME`` and
can be referenced as ``ceph/$BRANCH_NAME`` in local git commands.
.. prompt:: bash $
git remote add ceph https://github.com/ceph/ceph.git
git fetch ceph
Resetting Local Master to Upstream Master
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
These commands fetch all the branches and commits from ``ceph/ceph.git`` to
the local git repo as ``remotes/ceph/$BRANCH_NAME`` and can be referenced as
``ceph/$BRANCH_NAME`` in local git commands.
Your local ``master`` branch can be reset to the upstream Ceph ``master``
branch by running the following commands:
Fixing the Bug
--------------
.. _synchronizing:
Synchronizing Local Main with Upstream Main
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In your local working copy, there is a copy of the ``main`` branch in
``remotes/origin/main``. This is called "local main". This copy of the
main branch (https://github.com/your_github_id/ceph.git) is "frozen in time"
at the moment that you cloned it, but the upstream repo
(https://github.com/ceph/ceph.git, typically abbreviated to ``ceph/ceph.git``)
that it was forked from is not frozen in time: the upstream repo is still being
updated by other contributors.
Because upstream main is continually receiving updates from other
contributors, your fork will drift farther and farther from the state of the
upstream repo when you cloned it.
Keep your fork's ``main`` branch synchronized with upstream main to reduce drift
between your fork's main branch and the upstream main branch.
Here are the commands for keeping your fork synchronized with the
upstream repository:
.. prompt:: bash $
git fetch ceph
git checkout master
git reset --hard ceph/master
git push -u origin master
git checkout main
git reset --hard ceph/main
git push -u origin main
This procedure should be followed often, in order to keep your local ``master``
in sync with upstream ``master``.
Follow this procedure often to keep your local ``main`` in sync with upstream
``main``.
If the command ``git status`` returns a line that reads "Untracked files", see
:ref:`the procedure on updating submodules <update-submodules>`.
.. _bugfix_branch:
Creating a Bugfix branch
------------------------
^^^^^^^^^^^^^^^^^^^^^^^^
Create a branch for your bugfix:
.. prompt:: bash $
git checkout master
git checkout main
git checkout -b fix_1
git push -u origin fix_1
This creates a local branch called ``fix_1`` in our GitHub fork. At this point,
the ``fix_1`` branch is identical to the ``master`` branch, but not for long!
You are now ready to modify the code. Be careful to always run `git checkout
master` first, otherwise you may find commits from an unrelated branch mixed
with your new work.
The first command (``git checkout main``) makes sure that the bugfix branch
``fix_1`` is created from the most recent state of the main branch of the
upstream repository.
Fixing the bug locally
----------------------
The second command (``git checkout -b fix_1``) creates a "bugfix branch" called
``fix_1`` in your local working copy of the repository. The changes that you make
in order to fix the bug will be committed to this branch.
In the `Ceph issue tracker <https://tracker.ceph.com>`_, change the status of
the tracker issue to "In progress". This communicates to other Ceph
contributors that you have begun working on a fix, which helps to avoid
duplication of effort. If you don't have permission to change that field, your
previous comment that you are working on the issue is sufficient.
The third command (``git push -u origin fix_1``) pushes the bugfix branch from
your local working copy to your fork of the upstream repository.
Your fix may be very simple and require only minimal testing. But that's not
likely. It is more likely that the process of fixing your bug will be iterative
and will involve trial and error, as well as skill. An explanation of how to
fix bugs is beyond the scope of this document. Instead, we focus on the
mechanics of the process in the context of the Ceph project.
.. _fixing_bug_locally:
For a detailed discussion of the tools available for validating bugfixes,
see the chapters on testing.
Fixing the bug in the local working copy
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
For now, let us assume that you have finished work on the bugfix, that you have
tested the bugfix, and that you believe that it works. Commit the changes to
your local branch using the ``--signoff`` option (here represented as the `s`
portion of the `-as` flag):
#. **Updating the tracker**
.. prompt:: bash $
In the `Ceph issue tracker <https://tracker.ceph.com>`_, change the status
of the tracker issue to "In progress". This communicates to other Ceph
contributors that you have begun working on a fix, which helps to avoid
duplication of effort. If you don't have permission to change that field,
just comment that you are working on the issue.
git commit -as
#. **Fixing the bug itself**
Push the changes to your fork:
This guide cannot tell you how to fix the bug that you have chosen to fix.
This guide assumes that you know what needs improvement and that you know
how to provide that improvement.
.. prompt:: bash $
It might be that your fix is simple and requires only minimal testing. But
that's unlikely. It is more likely that the process of fixing your bug will
be iterative and will involve trial, error, skill, and patience.
For a detailed discussion of the tools available for validating bugfixes,
see the chapters on testing.
Pushing the Fix to Your Fork
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You have finished work on the bugfix. You have tested the bugfix, and you
believe that it works.
#. Commit the changes to your local working copy.
Commit the changes to the ``fix_1`` branch of your local working copy by using
the ``--signoff`` option (here represented as the ``s`` portion of the ``-as``
flag):
.. prompt:: bash $
git commit -as
.. _push_changes:
#. Push the changes to your fork:
Push the changes from the `fix_1` branch of your local working copy to the
`fix_1` branch of your fork of the upstream repository:
.. prompt:: bash $
git push origin fix_1
.. note::
In the command ``git push origin fix_1``, ``origin`` is the name of your
fork of the upstream Ceph repository, and can be thought of as a nickname
for ``git@github.com:username/ceph.git``, where ``username`` is your
GitHub username.
It is possible that ``origin`` is not the name of your fork. Discover the
name of your fork by running ``git remote -v``, as shown here:
.. code-block:: bash
$ git remote -v
ceph https://github.com/ceph/ceph.git (fetch)
ceph https://github.com/ceph/ceph.git (push)
origin git@github.com:username/ceph.git (fetch)
origin git@github.com:username/ceph.git (push)
The line::
origin git@github.com:username/ceph.git (fetch)
and the line::
origin git@github.com:username/ceph.git (push)
provide the information that "origin" is the name of your fork of the
Ceph repository.
git push origin fix_1
Opening a GitHub pull request
-----------------------------
The next step is to open a GitHub pull request (PR). This makes your bugfix
visible to the community of Ceph contributors. They will review it and may
perform additional testing and / or request changes.
After you have pushed the bugfix to your fork, open a GitHub pull request
(PR). This makes your bugfix visible to the community of Ceph contributors.
They will review it. They may perform additional testing on your bugfix, and
they might request changes to the bugfix.
This is the point where you "go public" with your modifications. Be prepared
to receive suggestions and constructive criticism in the form of comments
within the PR. Don't worry! The Ceph project is a friendly place!
Be prepared to receive suggestions and constructive criticism in the form of
comments within the PR.
If you are uncertain how to create and manage pull requests, you may read
`this GitHub pull request tutorial`_.
If you don't know how to create and manage pull requests, read `this GitHub
pull request tutorial`_.
.. _`this GitHub pull request tutorial`:
https://help.github.com/articles/using-pull-requests/
For ideas on what constitutes a "good" pull request, see
To learn what constitutes a "good" pull request, see
the `Git Commit Good Practice`_ article at the `OpenStack Project Wiki`_.
.. _`Git Commit Good Practice`: https://wiki.openstack.org/wiki/GitCommitMessages
.. _`OpenStack Project Wiki`: https://wiki.openstack.org/wiki/Main_Page
and our own `Submitting Patches <https://github.com/ceph/ceph/blob/master/SubmittingPatches.rst>`_ document.
See also our own `Submitting Patches
<https://github.com/ceph/ceph/blob/main/SubmittingPatches.rst>`_ document.
Once your pull request (PR) is opened, update the :ref:`issue-tracker` by
adding a comment directing other contributors to your PR. The comment can be
as simple as::
After your pull request (PR) has been opened, update the :ref:`issue-tracker`
by adding a comment directing other contributors to your PR. The comment can be
as simple as this::
*PR*: https://github.com/ceph/ceph/pull/$NUMBER_OF_YOUR_PULL_REQUEST
@ -230,24 +347,24 @@ Understanding Automated PR validation
When you create or update your PR, the Ceph project's `Continuous Integration
(CI) <https://en.wikipedia.org/wiki/Continuous_integration>`_ infrastructure
automatically tests it. At the time of this writing (September 2020), the
automated CI testing included five tests:
automatically tests it. At the time of this writing (May 2022), the automated
CI testing included many tests. These five are among them:
#. a test to check that the commits are properly signed (see :ref:`submitting-patches`)
#. a test to check that the documentation builds
#. a test to check that the submodules are unmodified
#. a test to check that the API is in order
#. a :ref:`make check<make-check>` test
Additional tests may be performed depending on which files your PR modifies.
#. a :ref:`make check<make-check>` test
The :ref:`make check<make-check>` test builds the PR and runs it through a battery of
tests. These tests run on servers operated by the Ceph Continuous
Integration (CI) team. When the tests complete, the result will be shown
on GitHub in the pull request itself.
Additional tests may be run depending on which files your PR modifies.
You should test your modifications before you open a PR.
Refer to the chapters on testing for details.
The :ref:`make check<make-check>` test builds the PR and runs it through a
battery of tests. These tests run on servers that are operated by the Ceph
Continuous Integration (CI) team. When the tests have completed their run, the
result is shown on GitHub in the pull request itself.
Test your modifications before you open a PR. Refer to the chapters
on testing for details.
Notes on PR make check test
^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -255,26 +372,28 @@ Notes on PR make check test
The GitHub :ref:`make check<make-check>` test is driven by a Jenkins instance.
Jenkins merges your PR branch into the latest version of the base branch before
starting tests. This means that you don't have to rebase the PR to pick up any fixes.
it starts any tests. This means that you don't have to rebase the PR in order
to pick up any fixes.
You can trigger PR tests at any time by adding a comment to the PR - the
comment should contain the string "test this please". Since a human subscribed
to the PR might interpret that as a request for him or her to test the PR, we
recommend that you address Jenkins directly. For example, write "jenkins retest
this please". For efficiency a single re-test can also be requested with
e.g. "jenkins test signed". For reference, a list of these requests is
automatically added to the end of each new PR's description.
comment should contain the string "test this please". Since a human who is
subscribed to the PR might interpret that as a request for him or her to test
the PR, you must address Jenkins directly. For example, write "jenkins retest
this please". If you need to run only one of the tests, you can request it with
a command like "jenkins test signed". A list of these requests is automatically
added to the end of each new PR's description, so check there to find the
single test you need.
If there is a build failure and you aren't sure what caused it, check the
:ref:`make check<make-check>` log. To access it, click on the "details" (next
to the :ref:`make check<make-check>` test in the PR) link to enter the Jenkins web
GUI. Then click on "Console Output" (on the left).
:ref:`make check<make-check>` log. To access the make check log, click the
"details" (next to the :ref:`make check<make-check>` test in the PR) link to
enter the Jenkins web GUI. Then click "Console Output" (on the left).
Jenkins is configured to search logs for strings known to have been associated
with :ref:`make check<make-check>` failures in the past. However, there is no
guarantee that these known strings are associated with any given
:ref:`make check<make-check>` failure. You'll have to read through the log to determine the
cause of your specific failure.
Jenkins is configured to search logs for strings that are known to have been
associated with :ref:`make check<make-check>` failures in the past. However,
there is no guarantee that these known strings are associated with any given
:ref:`make check<make-check>` failure. You'll have to read through the log to
determine the cause of your specific failure.
Integration tests AKA ceph-qa-suite
-----------------------------------
@ -284,7 +403,7 @@ see how it behaves on real clusters running on physical or virtual
hardware. Tests designed for this purpose live in the `ceph/qa
sub-directory`_ and are run via the `teuthology framework`_.
.. _`ceph/qa sub-directory`: https://github.com/ceph/ceph/tree/master/qa/
.. _`ceph/qa sub-directory`: https://github.com/ceph/ceph/tree/main/qa/
.. _`teuthology repository`: https://github.com/ceph/teuthology
.. _`teuthology framework`: https://github.com/ceph/teuthology
@ -329,7 +448,7 @@ will need to force push your branch with:
git push --force origin fix_1
Why do we take these extra steps instead of simply adding additional commits
the the PR? It is best practice for a PR to consist of a single commit; this
to the PR? It is best practice for a PR to consist of a single commit; this
makes for clean history, eases peer review of your changes, and facilitates
merges. In rare circumstances it also makes it easier to cleanly revert
changes.
@ -388,7 +507,7 @@ Another method of generating merge commits involves using Patrick Donnelly's
**/ceph/src/script/ptl-tool.py**. Merge commits that have been generated by
the **ptl-tool** have the following form::
Merge PR #36257 into master
Merge PR #36257 into main
* refs/pull/36257/head:
client: move client_lock to _unmount()
client: add timer_lock support

View File

@ -139,8 +139,46 @@ using `Internet Relay Chat`_.
.. _`Internet Relay Chat`: http://www.irchelp.org/
See ``https://ceph.com/irc/`` for how to set up your IRC
client and a list of channels.
The Ceph community gathers in the #ceph channel of the Open and Free Technology
Community (OFTC) IRC network.
Created in 1988, Internet Relay Chat (IRC) is a relay-based, real-time chat
protocol. It is mainly designed for group (many-to-many) communication in
discussion forums called channels, but also allows one-to-one communication via
private message. On IRC you can talk to many other members using Ceph, on
topics ranging from idle chit-chat to support questions. Though a channel might
have many people in it at any one time, they might not always be at their
keyboard; so if no-one responds, just wait around and someone will hopefully
answer soon enough.
Registration
^^^^^^^^^^^^
If you intend to use the IRC service on a continued basis, you are advised to
register an account. Registering gives you a unique IRC identity and allows you
to access channels where unregistered users have been locked out for technical
reasons.
See `the official OFTC (Open and Free Technology Community) documentation's
registration instructions
<https://www.oftc.net/Services/#register-your-account>`_ to learn how to
register your IRC account.
Channels
~~~~~~~~
To connect to the OFTC IRC network, download an IRC client and configure it to
connect to ``irc.oftc.net``. Then join one or more of the channels. Discussions
inside #ceph are logged and archives are available online.
Here are the real-time discussion channels for the Ceph community:
- #ceph
- #ceph-devel
- #cephfs
- #ceph-dashboard
- #ceph-orchestrators
- #sepia
.. _submitting-patches:
@ -152,7 +190,7 @@ file `CONTRIBUTING.rst`_ in the top-level directory of the source-code
tree. There may be some overlap between this guide and that file.
.. _`CONTRIBUTING.rst`:
https://github.com/ceph/ceph/blob/master/CONTRIBUTING.rst
https://github.com/ceph/ceph/blob/main/CONTRIBUTING.rst
All newcomers are encouraged to read that file carefully.
@ -252,7 +290,7 @@ See :ref:`kubernetes-dev`
Backporting
-----------
All bugfixes should be merged to the ``master`` branch before being
All bugfixes should be merged to the ``main`` branch before being
backported. To flag a bugfix for backporting, make sure it has a
`tracker issue`_ associated with it and set the ``Backport`` field to a
comma-separated list of previous releases (e.g. "hammer,jewel") that you think
@ -263,6 +301,36 @@ The rest (including the actual backporting) will be taken care of by the
.. _`tracker issue`: http://tracker.ceph.com/
.. _`Stable Releases and Backports`: http://tracker.ceph.com/projects/ceph-releases/wiki
Dependabot
----------
Dependabot is a GitHub bot that scans the dependencies in the repositories for
security vulnerabilities (CVEs). If a fix is available for a discovered CVE,
Dependabot creates a pull request to update the dependency.
Dependabot also indicates the compatibility score of the upgrade. This score is
based on the number of CI failures that occur in other GitHub repositories
where the fix was applied.
With some configuration, Dependabot can perform non-security updates (for
example, it can upgrade to the latest minor version or patch version).
Dependabot supports `several languages and package managers
<https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/about-dependabot-version-updates#supported-repositories-and-ecosystems>`_.
As of July 2022, the Ceph project receives alerts only from pip (based on the
`requirements.txt` files) and npm (`package*.json`). It is possible to extend
these alerts to git submodules, Golang, and Java. As of July 2022, there is no
support for C++ package managers such as vcpkg and conan, or for C++20 modules.
Many of the dependencies that Dependabot discovers are best updated somewhere
other than the Ceph GitHub repository (for example, distribution packages are
a better place to update some of them). Nonetheless, the list of new and
existing vulnerabilities that Dependabot generates is useful.
`Here is an example of a Dependabot pull request.
<https://github.com/ceph/ceph/pull/46998>`_
Guidance for use of cluster log
-------------------------------

View File

@ -18,10 +18,10 @@ What ?
Where ?
^^^^^^^
Features are merged to the *master* branch. Bug fixes should be merged to the
Features are merged to the *main* branch. Bug fixes should be merged to the
corresponding named branch (e.g. *nautilus* for 14.0.z, *pacific* for 16.0.z,
etc.). However, this is not mandatory - bug fixes and documentation
enhancements can be merged to the *master* branch as well, since the *master*
enhancements can be merged to the *main* branch as well, since the *main*
branch is itself occasionally merged to the named branch during the development
releases phase. In either case, if a bug fix is important it can also be
flagged for backport to one or more previous stable releases.
@ -32,16 +32,16 @@ When ?
After each stable release, candidate branches for previous releases enter
phase 2 (see below). For example: the *jewel* named branch was created when
the *infernalis* release candidates entered phase 2. From this point on,
*master* was no longer associated with *infernalis*. After the named branch of
the next stable release is created, *master* will be occasionally merged into
*main* was no longer associated with *infernalis*. After the named branch of
the next stable release is created, *main* will be occasionally merged into
it.
Branch merges
^^^^^^^^^^^^^
* The latest stable release branch is merged periodically into master.
* The master branch is merged periodically into the branch of the stable release.
* The master is merged into the stable release branch
* The latest stable release branch is merged periodically into main.
* The main branch is merged periodically into the branch of the stable release.
* The main branch is merged into the stable release branch
immediately after each development (x.0.z) release.
Stable release candidates (i.e. x.1.z) phase 1
@ -56,12 +56,12 @@ Where ?
^^^^^^^
The stable release branch (e.g. *jewel* for 10.0.z, *luminous*
for 12.0.z, etc.) or *master*. Bug fixes should be merged to the named
for 12.0.z, etc.) or *main*. Bug fixes should be merged to the named
branch corresponding to the stable release candidate (e.g. *jewel* for
10.1.z) or to *master*. During this phase, all commits to *master* will be
10.1.z) or to *main*. During this phase, all commits to *main* will be
merged to the named branch, and vice versa. In other words, it makes
no difference whether a commit is merged to the named branch or to
*master* - it will make it into the next release candidate either way.
*main* - it will make it into the next release candidate either way.
When ?
^^^^^^
@ -72,9 +72,9 @@ x.1.0 tag is set in the release branch.
Branch merges
^^^^^^^^^^^^^
* The stable release branch is merged periodically into *master*.
* The *master* branch is merged periodically into the stable release branch.
* The *master* branch is merged into the stable release branch
* The stable release branch is merged periodically into *main*.
* The *main* branch is merged periodically into the stable release branch.
* The *main* branch is merged into the stable release branch
immediately after each x.1.z release candidate.
Stable release candidates (i.e. x.1.z) phase 2
@ -90,7 +90,7 @@ Where ?
The stable release branch (e.g. *mimic* for 13.0.z, *octopus* for 15.0.z,
etc.). During this phase, all commits to the named branch will be merged into
*master*. Cherry-picking to the named branch during release candidate phase 2
*main*. Cherry-picking to the named branch during release candidate phase 2
is performed manually since the official backporting process begins only when
the release is pronounced "stable".
@ -102,7 +102,7 @@ After Sage Weil announces that it is time for phase 2 to happen.
Branch merges
^^^^^^^^^^^^^
* The stable release branch is occasionally merged into master.
* The stable release branch is occasionally merged into main.
Stable releases (i.e. x.2.z)
----------------------------
@ -112,8 +112,8 @@ What ?
* Bug fixes
* Features are sometimes accepted
* Commits should be cherry-picked from *master* when possible
* Commits that are not cherry-picked from *master* must pertain to a bug unique to
* Commits should be cherry-picked from *main* when possible
* Commits that are not cherry-picked from *main* must pertain to a bug unique to
the stable release
* See also the `backport HOWTO`_ document

View File

@ -166,5 +166,12 @@ Unit test caveats
explicitly linked against something else. This enables tools such as
**valgrind** to be used in the tests.
#. The Google Test unit testing library hides the client output from the shell.
To debug the client, set the desired debug level
(e.g. ``ceph config set client debug_rbd 20``); the debug log file can then
be found at ``build/out/client.admin.<pid>.log``.
This is also handy when examining failed teuthology unit-test jobs: the
job's debug level can be set in the relevant YAML file.
.. _make check:
.. _teuthology framework: https://github.com/ceph/teuthology

View File

@ -86,6 +86,10 @@ separate file, like this::
.. graphviz:: myfile.dot
See the `Dot User's Manual <https://www.graphviz.org/pdf/dotguide.pdf>`_ by
Emden R. Gansner, Eleftherios Koutsofios, and Stephen North for examples of
digraphs. This is especially useful if this is your first time encountering
GraphViz.
Ditaa
-----

View File

@ -4,11 +4,7 @@
.. graphviz::
/*
* Rough outline of object store module dependencies
*/
digraph object_store {
digraph object_store {
size="7,7";
node [color=lightblue2, style=filled, fontname="Serif"];
@ -68,3 +64,4 @@
.. todo:: write more here

View File

@ -6,47 +6,52 @@ Glossary
--------
*chunk*
when the encoding function is called, it returns chunks of the same
size. Data chunks which can be concatenated to reconstruct the original
object and coding chunks which can be used to rebuild a lost chunk.
When the encoding function is called, it returns chunks of the same
size as each other. There are two kinds of chunks: (1) *data
chunks*, which can be concatenated to reconstruct the original
object, and (2) *coding chunks*, which can be used to rebuild a
lost chunk.
*chunk rank*
the index of a chunk when returned by the encoding function. The
rank of the first chunk is 0, the rank of the second chunk is 1
etc.
*stripe*
when an object is too large to be encoded with a single call,
each set of chunks created by a call to the encoding function is
called a stripe.
*shard|strip*
an ordered sequence of chunks of the same rank from the same
object. For a given placement group, each OSD contains shards of
the same rank. When dealing with objects that are encoded with a
single operation, *chunk* is sometime used instead of *shard*
because the shard is made of a single chunk. The *chunks* in a
*shard* are ordered according to the rank of the stripe they belong
to.
The index of a chunk, as determined by the encoding function. The
rank of the first chunk is 0, the rank of the second chunk is 1,
and so on.
*K*
the number of data *chunks*, i.e. the number of *chunks* in which the
original object is divided. For instance if *K* = 2 a 10KB object
will be divided into *K* objects of 5KB each.
The number of data chunks into which an object is divided. For
example, if *K* = 2, then a 10KB object is divided into two chunks
of 5KB each.
*M*
the number of coding *chunks*, i.e. the number of additional *chunks*
computed by the encoding functions. If there are 2 coding *chunks*,
it means 2 OSDs can be out without losing data.
The number of coding chunks computed by the encoding function. *M*
is equal to the number of OSDs that can be missing from the cluster
without the cluster suffering data loss. For example, if there are
two coding chunks, then two OSDs can be missing without data loss.
*N*
the number of data *chunks* plus the number of coding *chunks*,
i.e. *K+M*.
The number of data chunks plus the number of coding chunks: that
is, *K* + *M*.
*rate*
the proportion of the *chunks* that contains useful information, i.e. *K/N*.
For instance, for *K* = 9 and *M* = 3 (i.e. *K+M* = *N* = 12) the rate is
*K* = 9 / *N* = 12 = 0.75, i.e. 75% of the chunks contain useful information.
The proportion of the total chunks containing useful information:
that is, *K* divided by *N*. For example, suppose that *K* = 9 and
*M* = 3. This would mean that *N* = 12 (because *K* + *M* = 9 + 3).
Therefore, the *rate* (*K* / *N*) would be 9 / 12 = 0.75. In other
words, 75% of the chunks would contain useful information.
*shard* (also called *strip*)
An ordered sequence of chunks of the same rank from the same object. For a
given placement group, each OSD contains shards of the same rank. In the
special case in which an object is encoded with only one call to the
encoding function, the term *chunk* may be used instead of *shard* because
the shard is made of a single chunk. The chunks in a shard are ordered
according to the rank of the stripe (see *stripe* below) they belong to.
*stripe*
If an object is so large that encoding it requires more than one
call to the encoding function, each of these calls creates a set of
chunks called a *stripe*.
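The arithmetic behind *K*, *M*, *N*, and *rate* can be checked with a short
sketch that reuses the numbers from the *rate* entry above:

.. code-block:: python

   # Reproduces the arithmetic from the *rate* definition above.
   K = 9         # data chunks
   M = 3         # coding chunks
   N = K + M     # total chunks
   rate = K / N  # fraction of chunks carrying useful information

   print(N, rate)  # 12 0.75 -- 75% of the chunks contain useful information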
The definitions are illustrated as follows (PG stands for placement group):
::
@ -71,8 +76,8 @@ The definitions are illustrated as follows (PG stands for placement group):
| ... | | ... |
+-------------------------+ +-------------------------+
Table of content
----------------
Table of contents
-----------------
.. toctree::
:maxdepth: 1

View File

@ -2,172 +2,224 @@
Ceph Release Process
======================
1. Build environment
====================
Prerequisites
=============
There are multiple build environments. Debian-based packages are built via pbuilder for multiple distributions. The build hosts are listed in the ``deb_hosts`` file, and the list of distributions is in ``deb_dist``. All distributions are built on each of the build hosts. Currently there is one 64-bit and one 32-bit build host.
Signing Machine
---------------
The signing machine is a virtual machine in the `Sepia lab
<https://wiki.sepia.ceph.com/doku.php?id=start>`_. SSH access to the signing
machine is limited to the usual Infrastructure Admins along with a few other
component leads (e.g., nfs-ganesha, ceph-iscsi).
The RPM based packages are built natively, so one distribution per build host. The list of hosts is found in ``rpm_hosts``.
The ``ubuntu`` user on the machine has some `build scripts <https://github.com/ceph/ceph-build/tree/main/scripts>`_ that help with pulling, pushing, and signing packages.
Prior to building, it's necessary to update the pbuilder seed tarballs::
The GPG signing key permanently lives on a `Nitrokey Pro <https://shop.nitrokey.com/shop/product/nkpr2-nitrokey-pro-2-3>`_ and is passed through to the VM via RHV. This helps to ensure that the key cannot be exported or leave the datacenter in any way.
./update_all_pbuilders.sh
New Major Releases
------------------
For each new major (alphabetical) release, you must create one ``ceph-release`` RPM for each RPM repo (e.g., one for el8 and one for el9). `chacra <https://github.com/ceph/chacra>`_ is a python service we use to store DEB and RPM repos. The chacra repos are configured to include this ceph-release RPM, but it must be built separately. You must make sure that chacra is properly configured to include this RPM for each particular release.
2. Setup keyring for signing packages
=====================================
1. Update chacra so it is aware of the new Ceph release. See `this PR <https://github.com/ceph/chacra/pull/219>`_ for an example.
2. Redeploy chacra (e.g., ``ansible-playbook chacra.ceph.com.yml``)
3. Run https://jenkins.ceph.com/view/all/job/ceph-release-rpm/
::
Summarized build process
========================
export GNUPGHOME=<path to keyring dir>
1. QE finishes testing and finds a stopping point. That commit is pushed to the ``$release-release`` branch in ceph.git (e.g., ``quincy-release``). This allows work to continue in the working ``$release`` branch without having to freeze it during the release process.
2. The Ceph Council approves and notifies the "Build Lead".
3. The "Build Lead" starts the `Jenkins multijob <https://jenkins.ceph.com/view/all/job/ceph>`_, which triggers all builds.
4. Packages are pushed to chacra.ceph.com.
5. Packages are pulled from chacra.ceph.com to the Signer VM.
6. Packages are signed.
7. Packages are pushed to download.ceph.com.
8. Release containers are built and pushed to quay.io.
# verify it's accessible
gpg --list-keys
Hotfix Release Process Deviation
--------------------------------
The release key should be present::
A hotfix release has a couple of differences.
pub 4096R/17ED316D 2012-05-20
uid Ceph Release Key <sage@newdream.net>
1. Check out the most recent tag. For example, if we're releasing a hotfix on top of 17.2.3, ``git checkout -f -B quincy-release origin/v17.2.3``
2. ``git cherry-pick -x`` the necessary hotfix commits
3. ``git push -f origin quincy-release``
4. Notify the "Build Lead" to start the build.
5. The "Build Lead" should set ``RELEASE_TYPE=HOTFIX`` instead of ``STABLE``.
Security Release Process Deviation
----------------------------------
3. Set up build area
====================
A security/CVE release is similar to a hotfix release with two differences:
Clone the ceph and ceph-build source trees::
1. The fix should be pushed to the `ceph-private <https://github.com/ceph/ceph-private>`_ repo instead of ceph.git (requires GitHub Admin Role).
2. The tags (e.g., v17.2.4) must be manually pushed to ceph.git by the "Build Lead."
git clone http://github.com/ceph/ceph.git
git clone http://github.com/ceph/ceph-build.git
1. Check out the most recent tag. For example, if we're releasing a security fix on top of 17.2.3, ``git checkout -f -B quincy-release origin/v17.2.3``
2. ``git cherry-pick -x`` the necessary security fix commits
3. ``git remote add security git@github.com:ceph/ceph-private.git``
4. ``git push -f security quincy-release``
5. Notify the "Build Lead" to start the build.
6. The "Build Lead" should set ``RELEASE_TYPE=SECURITY`` instead of ``STABLE``.
7. Finally, the `ceph-tag <https://github.com/ceph/ceph-build/blob/main/ansible/roles/ceph-release/tasks/push.yml>`_ steps need to be manually run by the "Build Lead" as close to the Announcement time as possible::
In the ceph source directory, checkout next branch (for point releases use the {codename} branch)::
# Example using quincy pretending 17.2.4 is the security release version
# Add the ceph-releases repo (also requires GitHub Admin Role). The `ceph-setup <https://jenkins.ceph.com/job/ceph-setup>`_ job will have already created and pushed the tag to ceph-releases.git.
git remote add releases git@github.com:ceph/ceph-releases.git
git fetch --all
# Check out the version commit
git checkout -f -B quincy-release releases/quincy-release
git push -f origin quincy-release
git push origin v17.2.4
# Now create a Pull Request of quincy-release targeting quincy to merge the version commit and security fixes back into the quincy branch
git checkout next
1. Preparing the release branch
===============================
Checkout the submodules::
Once QE has determined a stopping point in the working (e.g., ``quincy``) branch, that commit should be pushed to the corresponding ``quincy-release`` branch.
git submodule update --force --init --recursive
Notify the "Build Lead" that the release branch is ready.
4. Update Build version numbers
================================
Substitute the ceph release number where indicated below by the string ``0.xx``.
Edit configure.ac and update the version number. Example diff::
-AC_INIT([ceph], [0.54], [ceph-devel@vger.kernel.org])
+AC_INIT([ceph], [0.55], [ceph-devel@vger.kernel.org])
Update the version number in the debian change log::
DEBEMAIL user@host dch -v 0.xx-1
Commit the changes::
git commit -a
Tag the release::
../ceph-build/tag-release v0.xx
5. Create Makefiles
===================
The actual configure options used to build packages are in the
``ceph.spec.in`` and ``debian/rules`` files. At this point we just
need to create a Makefile.::
./do_autogen.sh
6. Run the release scripts
==========================
This creates tarballs and copies them, with other needed files to
the build hosts listed in deb_hosts and rpm_hosts, runs a local build
script, then rsyncs the results back to the specified release directory.::
../ceph-build/do_release.sh /tmp/release
7. Create RPM Repo
==================
Copy the rpms to the destination repo::
mkdir /tmp/rpm-repo
../ceph-build/push_to_rpm_repo.sh /tmp/release /tmp/rpm-repo 0.xx
Next add any additional rpms to the repo that are needed such as leveldb.
See RPM Backports section
Finally, sign the rpms and build the repo indexes::
../ceph-build/sign_and_index_rpm_repo.sh /tmp/release /tmp/rpm-repo 0.xx
8. Create Debian repo
2. Starting the build
=====================
The key-id used below is the id of the ceph release key from step 2::
We'll use a stable/regular 15.2.17 release of Octopus as an example throughout this document.
mkdir /tmp/debian-repo
../ceph-build/gen_reprepro_conf.sh /tmp/debian-repo key-id
../ceph-build/push_to_deb_repo.sh /tmp/release /tmp/debian-repo 0.xx main
1. Browse to https://jenkins.ceph.com/view/all/job/ceph/build?delay=0sec
2. Log in with GitHub OAuth
3. Set the parameters as necessary::
BRANCH=octopus
TAG=checked
VERSION=15.2.17
RELEASE_TYPE=STABLE
ARCHS=x86_64 arm64
Next add any addition debian packages that are needed such as leveldb.
See the Debian Backports section below.
4. Use https://docs.ceph.com/en/latest/start/os-recommendations/?highlight=debian#platforms to determine the ``DISTROS`` parameter. For example,
Debian packages are signed when added to the repo, so no further action is
needed.
+-------------------+-------------------------------------------+
| Release           | Distro Codemap                            |
+===================+===========================================+
| octopus (15.X.X)  | ``focal bionic centos7 centos8 buster``   |
+-------------------+-------------------------------------------+
| pacific (16.X.X)  | ``focal bionic centos8 buster bullseye``  |
+-------------------+-------------------------------------------+
| quincy (17.X.X)   | ``focal centos8 centos9 bullseye``        |
+-------------------+-------------------------------------------+
5. Click ``Build``.
9. Push repos to ceph.org
==========================
3. Release Notes
================
For a development release::
Packages take hours to build. Use those hours to create the Release Notes and Announcements:
rcp ceph-0.xx.tar.bz2 ceph-0.xx.tar.gz \
ceph_site@ceph.com:ceph.com/downloads/.
rsync -av /tmp/rpm-repo/0.xx/ ceph_site@ceph.com:ceph.com/rpm-testing
rsync -av /tmp/debian-repo/ ceph_site@ceph.com:ceph.com/debian-testing
1. ceph.git Release Notes (e.g., `v15.2.17's ceph.git (docs.ceph.com) PR <https://github.com/ceph/ceph/pull/47198>`_)
2. ceph.io Release Notes (e.g., `v15.2.17's ceph.io.git (www.ceph.io) PR <https://github.com/ceph/ceph.io/pull/427>`_)
3. E-mail announcement
For a stable release, replace {CODENAME} with the release codename (e.g., ``argonaut`` or ``bobtail``)::
See `the Ceph Tracker wiki page that explains how to write the release notes <https://tracker.ceph.com/projects/ceph-releases/wiki/HOWTO_write_the_release_notes>`_.
rcp ceph-0.xx.tar.bz2 \
ceph_site@ceph.com:ceph.com/downloads/ceph-0.xx.tar.bz2
rcp ceph-0.xx.tar.gz \
ceph_site@ceph.com:ceph.com/downloads/ceph-0.xx.tar.gz
rsync -av /tmp/rpm-repo/0.xx/ ceph_site@ceph.com:ceph.com/rpm-{CODENAME}
rsync -auv /tmp/debian-repo/ ceph_site@ceph.com:ceph.com/debian-{CODENAME}
4. Signing and Publishing the Build
===================================
10. Update Git
==============
#. Obtain the sha1 of the version commit from the `build job <https://jenkins.ceph.com/view/all/job/ceph>`_ or the ``sha1`` file created by the `ceph-setup <https://jenkins.ceph.com/job/ceph-setup/>`_ job.
Point release
-------------
#. Download the packages from chacra.ceph.com to the signing virtual machine. These packages get downloaded to ``/opt/repos`` where the `Sepia Lab Long Running (Ceph) Cluster <https://wiki.sepia.ceph.com/doku.php?id=services:longrunningcluster>`_ is mounted.
For point releases just push the version number update to the
branch and the new tag::
.. prompt:: bash $
git push origin {codename}
git push origin v0.xx
ssh ubuntu@signer.front.sepia.ceph.com
sync-pull ceph [pacific|quincy|etc] <sha1>
Example::
$ sync-pull ceph octopus 8a82819d84cf884bd39c17e3236e0632ac146dc4
sync for: ceph octopus
********************************************
Found the most packages (332) in ubuntu/bionic.
No JSON object could be decoded
No JSON object could be decoded
ubuntu@chacra.ceph.com:/opt/repos/ceph/octopus/8a82819d84cf884bd39c17e3236e0632ac146dc4/ubuntu/bionic/flavors/default/* /opt/repos/ceph/octopus-15.2.17/debian/jessie/
--------------------------------------------
receiving incremental file list
db/
db/checksums.db
180.22K 100% 2.23MB/s 0:00:00 (xfr#1, to-chk=463/467)
db/contents.cache.db
507.90K 100% 1.95MB/s 0:00:00 (xfr#2, to-chk=462/467)
db/packages.db
etc...
Development and Stable releases
-------------------------------
#. Sign the DEBs:
For a development release, update tags for ``ceph.git``::
.. prompt:: bash
git push origin v0.xx
git push origin HEAD:last
git checkout master
git merge next
git push origin master
git push origin HEAD:next
merfi gpg /opt/repos/ceph/octopus-15.2.17/debian
Similarly, for a development release, for both ``teuthology.git`` and ``ceph-qa-suite.git``::
Example::
git checkout master
git reset --hard origin/master
git branch -f last origin/next
git push -f origin last
git push -f origin master:next
$ merfi gpg /opt/repos/ceph/octopus-15.2.17/debian
--> Starting path collection, looking for files to sign
--> 18 matching paths found
--> will sign with the following commands:
--> gpg --batch --yes --armor --detach-sig --output Release.gpg Release
--> gpg --batch --yes --clearsign --output InRelease Release
--> signing: /opt/repos/ceph/octopus-15.2.17/debian/jessie/dists/bionic/Release
--> Running command: gpg --batch --yes --armor --detach-sig --output Release.gpg Release
--> Running command: gpg --batch --yes --clearsign --output InRelease Release
--> signing: /opt/repos/ceph/octopus-15.2.17/debian/jessie/dists/focal/Release
--> Running command: gpg --batch --yes --armor --detach-sig --output Release.gpg Release
--> Running command: gpg --batch --yes --clearsign --output InRelease Release
etc...
#. Sign the RPMs:
.. prompt:: bash
sign-rpms octopus
Example::
$ sign-rpms octopus
Checking packages in: /opt/repos/ceph/octopus-15.2.17/centos/7
signing: /opt/repos/ceph/octopus-15.2.17/centos/7/SRPMS/ceph-release-1-1.el7.src.rpm
/opt/repos/ceph/octopus-15.2.17/centos/7/SRPMS/ceph-release-1-1.el7.src.rpm:
signing: /opt/repos/ceph/octopus-15.2.17/centos/7/SRPMS/ceph-15.2.17-0.el7.src.rpm
/opt/repos/ceph/octopus-15.2.17/centos/7/SRPMS/ceph-15.2.17-0.el7.src.rpm:
signing: /opt/repos/ceph/octopus-15.2.17/centos/7/noarch/ceph-mgr-modules-core-15.2.17-0.el7.noarch.rpm
etc...
5. Publish the packages to download.ceph.com:
.. prompt:: bash $
sync-push octopus
5. Build Containers
===================
Start the following two jobs:
#. https://2.jenkins.ceph.com/job/ceph-container-build-ceph-base-push-imgs/
#. https://2.jenkins.ceph.com/job/ceph-container-build-ceph-base-push-imgs-arm64/
6. Announce the Release
=======================
Version Commit PR
-----------------
The `ceph-tag Jenkins job <https://jenkins.ceph.com/job/ceph-tag>`_ creates a Pull Request in ceph.git that targets the release branch.
If this was a regular release (not a hotfix release or a security release), the only commit in that Pull Request should be the version commit. For example, see `v15.2.17's version commit PR <https://github.com/ceph/ceph/pull/47520>`_.
Request a review and then merge the Pull Request.
Announcing
----------
Publish the Release Notes on ceph.io before announcing the release by email, because the e-mail announcement references the ceph.io blog post.

View File

@ -2,198 +2,355 @@
Ceph Glossary
===============
Ceph is growing rapidly. As firms deploy Ceph, the technical terms such as
"RADOS", "RBD," "RGW" and so forth require corresponding marketing terms
that explain what each component does. The terms in this glossary are
intended to complement the existing technical terminology.
Sometimes more than one term applies to a definition. Generally, the first
term reflects a term consistent with Ceph's marketing, and secondary terms
reflect either technical terms or legacy ways of referring to Ceph systems.
.. glossary::
Ceph Project
The aggregate term for the people, software, mission and infrastructure
of Ceph.
cephx
The Ceph authentication protocol. Cephx operates like Kerberos, but it
has no single point of failure.
:ref:`BlueStore<rados_config_storage_devices_bluestore>`
OSD BlueStore is a storage back end used by OSD daemons, and
was designed specifically for use with Ceph. BlueStore was
introduced in the Ceph Kraken release. In the Ceph Luminous
release, BlueStore became Ceph's default storage back end,
supplanting FileStore. Unlike :term:`filestore`, BlueStore
stores objects directly on Ceph block devices without any file
system interface. Since Luminous (12.2), BlueStore has been
Ceph's default and recommended storage back end.
Ceph
Ceph Platform
All Ceph software, which includes any piece of code hosted at
`https://github.com/ceph`_.
Ceph is a distributed network storage and file system with
distributed metadata management and POSIX semantics.
Ceph Block Device
A software instrument that orchestrates the storage of
block-based data in Ceph. Ceph Block Device (also called "RBD",
or "RADOS block device") splits block-based application data
into "chunks". RADOS stores these chunks as objects. Ceph Block
Device orchestrates the storage of those objects across the
storage cluster. See also :term:`RBD`.
Ceph Block Storage
One of the three kinds of storage supported by Ceph (the other
two are object storage and file storage). Ceph Block Storage is
the block storage "product", which refers to block-storage
related services and capabilities when used in conjunction with
the collection of (1) ``librbd`` (a python module that provides
file-like access to :term:`RBD` images), (2) a hypervisor such
as QEMU or Xen, and (3) a hypervisor abstraction layer such as
``libvirt``.
Ceph Client
Any of the Ceph components that can access a Ceph Storage
Cluster. This includes the Ceph Object Gateway, the Ceph Block
Device, the Ceph File System, and their corresponding
libraries. It also includes kernel modules, and FUSEs
(Filesystems in USERspace).
Ceph Client Libraries
The collection of libraries that can be used to interact with
components of the Ceph Cluster.
Ceph Cluster Map
See :term:`Cluster Map`
Ceph Dashboard
:ref:`The Ceph Dashboard<mgr-dashboard>` is a built-in
web-based Ceph management and monitoring application through
which you can inspect and administer various resources within
the cluster. It is implemented as a :ref:`ceph-manager-daemon`
module.
Ceph File System
See :term:`CephFS`
:ref:`CephFS<ceph-file-system>`
The **Ceph F**\ile **S**\ystem, or CephFS, is a
POSIX-compliant file system built on top of Ceph's distributed
object store, RADOS. See :ref:`CephFS Architecture
<arch-cephfs>` for more details.
Ceph Interim Release
See :term:`Releases`.
Ceph Kernel Modules
The collection of kernel modules that can be used to interact
with the Ceph Cluster (for example: ``ceph.ko``, ``rbd.ko``).
:ref:`Ceph Manager<ceph-manager-daemon>`
The Ceph manager daemon (ceph-mgr) is a daemon that runs
alongside monitor daemons to provide monitoring and interfacing
to external monitoring and management systems. Since the
Luminous release (12.x), no Ceph cluster functions properly
unless it contains a running ceph-mgr daemon.
Ceph Manager Dashboard
See :term:`Ceph Dashboard`.
Ceph Metadata Server
See :term:`MDS`.
Ceph Monitor
A daemon that maintains a map of the state of the cluster. This
"cluster state" includes the monitor map, the manager map, the
OSD map, and the CRUSH map. A Ceph cluster must contain a
minimum of three running monitors in order to be both redundant
and highly-available. Ceph monitors and the nodes on which they
run are often referred to as "mon"s. See :ref:`Monitor Config
Reference <monitor-config-reference>`.
Ceph Node
A Ceph node is a unit of the Ceph Cluster that communicates with
other nodes in the Ceph Cluster in order to replicate and
redistribute data. All of the nodes together are called the
:term:`Ceph Storage Cluster`. Ceph nodes include :term:`OSD`\s,
:term:`Ceph Monitor`\s, :term:`Ceph Manager`\s, and
:term:`MDS`\es. The term "node" is usually equivalent to "host"
in the Ceph documentation. If you have a running Ceph Cluster,
you can list all of the nodes in it by running the command
``ceph node ls all``.
:ref:`Ceph Object Gateway<object-gateway>`
An object storage interface built on top of librados. Ceph
Object Gateway provides a RESTful gateway between applications
and Ceph storage clusters.
Ceph Object Storage
See :term:`Ceph Object Store`.
Ceph Object Store
A Ceph Object Store consists of a :term:`Ceph Storage Cluster`
and a :term:`Ceph Object Gateway` (RGW).
:ref:`Ceph OSD<rados_configuration_storage-devices_ceph_osd>`
Ceph **O**\bject **S**\torage **D**\aemon. The Ceph OSD
software, which interacts with logical disks (:term:`OSD`).
Around 2013, there was an attempt by "research and industry"
(Sage's own words) to insist on using the term "OSD" to mean
only "Object Storage Device", but the Ceph community has always
persisted in using the term to mean "Object Storage Daemon" and
no less an authority than Sage Weil himself confirms in
November of 2022 that "Daemon is more accurate for how Ceph is
built" (private correspondence between Zac Dover and Sage Weil,
07 Nov 2022).
Ceph OSD Daemon
See :term:`Ceph OSD`.
Ceph OSD Daemons
See :term:`Ceph OSD`.
Ceph Point Release
See :term:`Releases`.
Ceph Project
The aggregate term for the people, software, mission and
infrastructure of Ceph.
Ceph Release
See :term:`Releases`.
Ceph Release Candidate
See :term:`Releases`.
Ceph Stable Release
See :term:`Releases`.
Ceph System
Ceph Stack
A collection of two or more components of Ceph.
Ceph Node
Node
Host
Any single machine or server in a Ceph System.
:ref:`Ceph Storage Cluster<arch-ceph-storage-cluster>`
The collection of :term:`Ceph Monitor`\s, :term:`Ceph
Manager`\s, :term:`Ceph Metadata Server`\s, and :term:`OSD`\s
that work together to store and replicate data for use by
applications, Ceph Users, and :term:`Ceph Client`\s. Ceph
Storage Clusters receive data from :term:`Ceph Client`\s.
Ceph Storage Cluster
Ceph Object Store
RADOS
RADOS Cluster
Reliable Autonomic Distributed Object Store
The core set of storage software which stores the user's data (MON+OSD).
Ceph Cluster Map
Cluster Map
The set of maps comprising the monitor map, OSD map, PG map, MDS map and
CRUSH map. See `Cluster Map`_ for details.
Ceph Object Storage
The object storage "product", service or capabilities, which consists
essentially of a Ceph Storage Cluster and a Ceph Object Gateway.
Ceph Object Gateway
RADOS Gateway
RGW
The S3/Swift gateway component of Ceph.
Ceph Block Device
RBD
The block storage component of Ceph.
Ceph Block Storage
The block storage "product," service or capabilities when used in
conjunction with ``librbd``, a hypervisor such as QEMU or Xen, and a
hypervisor abstraction layer such as ``libvirt``.
Ceph File System
CephFS
Ceph FS
The POSIX filesystem components of Ceph. Refer to
:ref:`CephFS Architecture <arch-cephfs>` and :ref:`ceph-file-system` for
more details.
cephx
The Ceph authentication protocol. Cephx operates like Kerberos,
but it has no single point of failure.
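As a minimal sketch (assuming an admin keyring is available), the cephx
identities known to the cluster and their capabilities can be listed with:

.. prompt:: bash $

ceph auth ls
ceph auth get client.admin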
Cloud Platforms
Cloud Stacks
Third party cloud provisioning platforms such as OpenStack,
CloudStack, OpenNebula, and Proxmox VE.
Object Storage Device
OSD
A physical or logical storage unit (*e.g.*, LUN).
Sometimes, Ceph users use the
term "OSD" to refer to :term:`Ceph OSD Daemon`, though the
proper term is "Ceph OSD".
Cluster Map
The set of maps consisting of the monitor map, OSD map, PG map,
MDS map, and CRUSH map, which together report the state of the
Ceph cluster. See :ref:`the "Cluster Map" section of the
Architecture document<architecture_cluster_map>` for details.
Ceph OSD Daemon
Ceph OSD Daemons
Ceph OSD
The Ceph OSD software, which interacts with a logical
disk (:term:`OSD`). Sometimes, Ceph users use the
term "OSD" to refer to "Ceph OSD Daemon", though the
proper term is "Ceph OSD".
CRUSH
Controlled Replication Under Scalable Hashing. It is the
algorithm Ceph uses to compute object storage locations.
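For example (a minimal sketch assuming a pool named ``mypool`` and an
arbitrary object name ``foo``), the placement that CRUSH computes for an
object can be shown with:

.. prompt:: bash $

ceph osd map mypool foo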
OSD id
The integer that defines an OSD. It is generated by the monitors as part
of the creation of a new OSD.
CRUSH rule
The CRUSH data placement rule that applies to a particular
pool or set of pools.
OSD fsid
This is a unique identifier used to further improve the uniqueness of an
OSD. It is found in the OSD path in a file called ``osd_fsid``. The
term ``fsid`` is used interchangeably with ``uuid``.
DAS
**D**\irect-\ **A**\ttached **S**\torage. Storage that is
attached directly to the computer accessing it, without passing
through a network. Contrast with NAS and SAN.
OSD uuid
Just like the OSD fsid, this is the OSD's unique identifier. The term is
used interchangeably with ``fsid``.
:ref:`Dashboard<mgr-dashboard>`
A built-in web-based Ceph management and monitoring application
to administer various aspects and objects of the cluster. The
dashboard is implemented as a Ceph Manager module. See
:ref:`mgr-dashboard` for more details.
bluestore
OSD BlueStore is a storage back end for OSD daemons (Kraken and newer
releases). Unlike :term:`filestore`, it stores objects directly on the
Ceph block devices without any file system interface.
Dashboard Module
Another name for :term:`Dashboard`.
Dashboard Plugin
Another name for :term:`Dashboard`.
filestore
A back end for OSD daemons, where a Journal is needed and files are
written to the filesystem.
FQDN
**F**\ully **Q**\ualified **D**\omain **N**\ame. A domain name
that is applied to a node in a network and that specifies the
node's exact location in the tree hierarchy of the DNS.
In the context of Ceph cluster administration, FQDNs are often
applied to hosts. In this documentation, the term "FQDN" is
used mostly to distinguish between FQDNs and relatively simpler
hostnames, which do not specify the exact location of the host
in the tree hierarchy of the DNS but merely name the host.
Host
Any single machine or server in a Ceph Cluster. See :term:`Ceph
Node`.
LVM tags
Extensible metadata for LVM volumes and groups. It is used to
store Ceph-specific information about devices and their
relationship with OSDs.
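As a minimal sketch (assuming OSDs were deployed with ``ceph-volume lvm``),
the tags stored on the logical volumes can be listed with standard LVM
tooling:

.. prompt:: bash $

lvs -o lv_name,lv_tags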
:ref:`MDS<cephfs_add_remote_mds>`
The Ceph **M**\eta\ **D**\ata **S**\erver daemon. Also referred
to as "ceph-mds". The Ceph metadata server daemon must be
running in any Ceph cluster that runs the CephFS file system.
The MDS stores all filesystem metadata.
MGR
The Ceph manager software, which collects all the state from
the whole cluster in one place.
Ceph Monitor
MON
The Ceph monitor software.
Ceph Manager
MGR
The Ceph manager software, which collects all the state from the whole
cluster in one place.
Node
See :term:`Ceph Node`.
Ceph Manager Dashboard
Ceph Dashboard
Dashboard Module
Dashboard Plugin
Dashboard
A built-in web-based Ceph management and monitoring application to
administer various aspects and objects of the cluster. The dashboard is
implemented as a Ceph Manager module. See :ref:`mgr-dashboard` for more
details.
Object Storage Device
See :term:`OSD`.
Ceph Metadata Server
MDS
The Ceph metadata software.
OSD
Probably :term:`Ceph OSD`, but not necessarily. Sometimes
(especially in older correspondence, and especially in
documentation that is not written specifically for Ceph), "OSD"
means "**O**\bject **S**\torage **D**\evice", which refers to a
physical or logical storage unit (for example: LUN). The Ceph
community has always used the term "OSD" to refer to
:term:`Ceph OSD Daemon` despite an industry push in the
mid-2010s to insist that "OSD" should refer to "Object Storage
Device", so it is important to know which meaning is intended.
Ceph Clients
Ceph Client
The collection of Ceph components which can access a Ceph Storage
Cluster. These include the Ceph Object Gateway, the Ceph Block Device,
the Ceph File System, and their corresponding libraries, kernel modules,
and FUSEs.
OSD fsid
This is a unique identifier used to identify an OSD. It is
found in the OSD path in a file called ``osd_fsid``. The
term ``fsid`` is used interchangeably with ``uuid``.
Ceph Kernel Modules
The collection of kernel modules which can be used to interact with the
Ceph System (e.g., ``ceph.ko``, ``rbd.ko``).
OSD id
The integer that defines an OSD. It is generated by the
monitors during the creation of each OSD.
Ceph Client Libraries
The collection of libraries that can be used to interact with components
of the Ceph System.
OSD uuid
This is the unique identifier of an OSD. This term is used
interchangeably with ``fsid``.
Ceph Release
Any distinct numbered version of Ceph.
:ref:`Pool<rados_pools>`
A pool is a logical partition used to store objects.
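For illustration only (``mypool`` is an assumed name), pools can be listed
and created with:

.. prompt:: bash $

ceph osd lspools
ceph osd pool create mypool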
Ceph Point Release
Any ad-hoc release that includes only bug or security fixes.
Pools
See :term:`pool`.
Ceph Interim Release
Versions of Ceph that have not yet been put through quality assurance
testing, but may contain new features.
RADOS
**R**\eliable **A**\utonomic **D**\istributed **O**\bject
**S**\tore. RADOS is the object store that provides a scalable
service for variably-sized objects. The RADOS object store is
the core component of a Ceph cluster. `This blog post from
2009
<https://ceph.io/en/news/blog/2009/the-rados-distributed-object-store/>`_
provides a beginner's introduction to RADOS. Readers interested
in a deeper understanding of RADOS are directed to `RADOS: A
Scalable, Reliable Storage Service for Petabyte-scale Storage
Clusters <https://ceph.io/assets/pdfs/weil-rados-pdsw07.pdf>`_.
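As a minimal sketch (assuming a pool named ``mypool`` and a local file
``hello.txt``), objects can be written to and listed in RADOS directly with
the ``rados`` CLI:

.. prompt:: bash $

rados -p mypool put hello-object hello.txt
rados -p mypool ls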
Ceph Release Candidate
A major version of Ceph that has undergone initial quality assurance
testing and is ready for beta testers.
RADOS Cluster
A proper subset of the Ceph Cluster consisting of
:term:`OSD`\s, :term:`Ceph Monitor`\s, and :term:`Ceph
Manager`\s.
RADOS Gateway
See :term:`RGW`.
Ceph Stable Release
A major version of Ceph where all features from the preceding interim
releases have been put through quality assurance testing successfully.
RBD
The block storage component of Ceph. Also called "RADOS Block
Device" or :term:`Ceph Block Device`.
Releases
Ceph Interim Release
A version of Ceph that has not yet been put through
quality assurance testing. May contain new features.
Ceph Point Release
Any ad hoc release that includes only bug fixes and
security fixes.
Ceph Release
Any distinct numbered version of Ceph.
Ceph Release Candidate
A major version of Ceph that has undergone initial
quality assurance testing and is ready for beta
testers.
Ceph Stable Release
A major version of Ceph where all features from the
preceding interim releases have been put through
quality assurance testing successfully.
Reliable Autonomic Distributed Object Store
The core set of storage software which stores the user's data
(MON+OSD). See also :term:`RADOS`.
:ref:`RGW<object-gateway>`
**R**\ADOS **G**\ate **W**\ay.
The component of Ceph that provides a gateway to both the
Amazon S3 RESTful API and the OpenStack Swift API. Also called
"RADOS Gateway" and "Ceph Object Gateway".
secrets
Secrets are credentials used to perform digital authentication
whenever privileged users must access systems that require
authentication. Secrets can be passwords, API keys, tokens, SSH
keys, private certificates, or encryption keys.
SDS
Software-defined storage.
systemd oneshot
A systemd unit ``type`` in which the command specified in ``ExecStart``
exits upon completion (the process is not intended to daemonize).
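For example, on systems with the packaged Ceph systemd units installed, the
``ceph-volume@.service`` template is typically a oneshot unit; as a sketch
(assuming systemd-based packaging), its ``Type`` and ``ExecStart`` settings
can be inspected with:

.. prompt:: bash $

systemctl cat ceph-volume@.service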
Ceph Test Framework
Teuthology
The collection of software that performs scripted tests on Ceph.
CRUSH
Controlled Replication Under Scalable Hashing. It is the algorithm
Ceph uses to compute object storage locations.
CRUSH rule
The CRUSH data placement rule that applies to a particular pool or set of pools.
Pool
Pools
Pools are logical partitions for storing objects.
systemd oneshot
A systemd unit ``type`` in which the command specified in ``ExecStart`` exits
upon completion (the process is not intended to daemonize).
LVM tags
Extensible metadata for LVM volumes and groups. It is used to store
Ceph-specific information about devices and their relationship with
OSDs.
.. _https://github.com/ceph: https://github.com/ceph
.. _Cluster Map: ../architecture#cluster-map


View File

@ -5,10 +5,18 @@
Ceph uniquely delivers **object, block, and file storage in one unified
system**.
.. warning::
:ref:`If this is your first time using Ceph, read the "Basic Workflow"
page in the Ceph Developer Guide to learn how to contribute to the
Ceph project. (Click anywhere in this paragraph to read the "Basic
Workflow" page of the Ceph Developer Guide.) <basic workflow dev guide>`.
.. raw:: html
<style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style>
<table cellpadding="10"><colgroup><col width="33%"><col width="33%"><col width="33%"></colgroup><tbody valign="top"><tr><td><h3>Ceph Object Store</h3>
<style type="text/css">div.body h3{margin:5px 0px 0px 0px;}</style>
<table cellpadding="10"><colgroup><col width="33%"><col width="33%">
<col width="33%"></colgroup><tbody valign="top"><tr><td><h3>Ceph Object Store</h3>
- RESTful Interface
- S3- and Swift-compliant APIs
@ -107,6 +115,7 @@ about Ceph, see our `Architecture`_ section.
governance
foundation
ceph-volume/index
releases/general
releases/index
Ceph Releases (general) <https://docs.ceph.com/en/latest/releases/general/>
Ceph Releases (index) <https://docs.ceph.com/en/latest/releases/>
security/index
Glossary <glossary>

View File

@ -2,33 +2,37 @@
Cloning the Ceph Source Code Repository
=========================================
You may clone a Ceph branch of the Ceph source code by going to `github Ceph
Repository`_, selecting a branch (``master`` by default), and clicking the
**Download ZIP** button.
To clone a Ceph branch of the Ceph source code, go to `github Ceph
Repository`_, select a branch (``main`` by default), and click the **Download
ZIP** button.
.. _github Ceph Repository: https://github.com/ceph/ceph
To clone the entire git repository, :ref:`install <install-git>` and configure
``git``.
To clone the entire git repository, install and configure ``git``.
.. _install-git:
Install Git
===========
To install ``git`` on Debian/Ubuntu, execute::
To install ``git`` on Debian/Ubuntu, run the following command:
sudo apt-get install git
.. prompt:: bash $
sudo apt-get install git
To install ``git`` on CentOS/RHEL, execute::
To install ``git`` on CentOS/RHEL, run the following command:
sudo yum install git
.. prompt:: bash $
sudo yum install git
You must also have a ``github`` account. If you do not have a
``github`` account, go to `github.com`_ and register.
Follow the directions for setting up git at
`Set Up Git`_.
You must have a ``github`` account. If you do not have a ``github``
account, go to `github.com`_ and register. Follow the directions for setting
up git at `Set Up Git`_.
.. _github.com: https://github.com
.. _Set Up Git: https://help.github.com/linux-set-up-git
@ -37,26 +41,31 @@ Follow the directions for setting up git at
Add SSH Keys (Optional)
=======================
If you intend to commit code to Ceph or to clone using SSH
To commit code to Ceph or to clone the repository by using SSH
(``git@github.com:ceph/ceph.git``), you must generate SSH keys for github.
.. tip:: If you only intend to clone the repository, you may
.. tip:: If you want only to clone the repository, you can
use ``git clone --recursive https://github.com/ceph/ceph.git``
without generating SSH keys.
To generate SSH keys for ``github``, execute::
To generate SSH keys for ``github``, run the following command:
ssh-keygen
.. prompt:: bash $
Get the key to add to your ``github`` account (the following example
assumes you used the default file path)::
ssh-keygen
cat .ssh/id_rsa.pub
To print the SSH key that you just generated and that you will add to your
``github`` account, use the ``cat`` command. (The following example assumes you
used the default file path.):
.. prompt:: bash $
cat .ssh/id_rsa.pub
Copy the public key.
Go to your ``github`` account, click on "Account Settings" (i.e., the
'tools' icon); then, click "SSH Keys" on the left side navbar.
Go to your ``github`` account, click "Account Settings" (represented by the
'tools' icon), and click "SSH Keys" on the left side navbar.
Click "Add SSH key" in the "SSH Keys" list, enter a name for the key, paste the
key you generated, and press the "Add key" button.
@ -65,37 +74,122 @@ key you generated, and press the "Add key" button.
Clone the Source
================
To clone the Ceph source code repository, execute::
To clone the Ceph source code repository, run the following command:
git clone --recursive https://github.com/ceph/ceph.git
.. prompt:: bash $
Once ``git clone`` executes, you should have a full copy of the Ceph
git clone --recursive https://github.com/ceph/ceph.git
After ``git clone`` has run, you should have a full copy of the Ceph
repository.
.. tip:: Make sure you maintain the latest copies of the submodules
included in the repository. Running ``git status`` will tell you if
the submodules are out of date.
.. tip:: Make sure you maintain the latest copies of the submodules included in
the repository. Running ``git status`` will tell you whether the submodules
are out of date. See :ref:`update-submodules` for more information.
::
cd ceph
git status
.. prompt:: bash $
If your submodules are out of date, run::
cd ceph
git status
git submodule update --force --init --recursive
.. _update-submodules:
Updating Submodules
-------------------
#. Determine whether your submodules are out of date:
.. prompt:: bash $
git status
A. If your submodules are up to date
If your submodules are up to date, the following console output will
appear:
::
On branch main
Your branch is up to date with 'origin/main'.
nothing to commit, working tree clean
If you see this console output, then your submodules are up to date.
You do not need this procedure.
B. If your submodules are not up to date
If your submodules are not up to date, you will see a message that
includes a list of "untracked files". The example here shows such a
list, which was generated from a real situation in which the
submodules were no longer current. Your list of files will not be the
same as this list of files, but this list is provided as an example.
If in your case any untracked files are listed, then you should
continue to the next step of this procedure.
::
On branch main
Your branch is up to date with 'origin/main'.
Untracked files:
(use "git add <file>..." to include in what will be committed)
src/pybind/cephfs/build/
src/pybind/cephfs/cephfs.c
src/pybind/cephfs/cephfs.egg-info/
src/pybind/rados/build/
src/pybind/rados/rados.c
src/pybind/rados/rados.egg-info/
src/pybind/rbd/build/
src/pybind/rbd/rbd.c
src/pybind/rbd/rbd.egg-info/
src/pybind/rgw/build/
src/pybind/rgw/rgw.c
src/pybind/rgw/rgw.egg-info/
nothing added to commit but untracked files present (use "git add" to track)
#. If your submodules are out of date, run the following commands:
.. prompt:: bash $
git submodule update --force --init --recursive
git clean -fdx
git submodule foreach git clean -fdx
If you still have problems with a submodule directory, use ``rm -rf
[directory name]`` to remove the directory. Then run ``git submodule update
--init --recursive`` again.
#. Run ``git status`` again:
.. prompt:: bash $
git status
Your submodules are up to date if you see the following message:
::
On branch main
Your branch is up to date with 'origin/main'.
nothing to commit, working tree clean
Choose a Branch
===============
Once you clone the source code and submodules, your Ceph repository
will be on the ``master`` branch by default, which is the unstable
will be on the ``main`` branch by default, which is the unstable
development branch. You may choose other branches too.
- ``master``: The unstable development branch.
- ``stable``: The bugfix branch.
- ``main``: The unstable development branch.
- ``stable-release-name``: The name of a stable release branch, as listed in `Active Releases`_ (for example, ``pacific``).
- ``next``: The release candidate branch.
::
git checkout master
git checkout main
.. _Active Releases: https://docs.ceph.com/en/latest/releases/#active-releases

View File

@ -19,7 +19,7 @@ Ceph Container Images
Official Releases
-----------------
Ceph Container images are available from both Quay and Docker Hub::
Ceph Container images are available from Quay:
https://quay.io/repository/ceph/ceph
https://hub.docker.com/r/ceph

View File

@ -120,7 +120,7 @@ For RPMs::
https://download.ceph.com/rpm-{version}
The major releases of Ceph are summarized at: :ref:`ceph-releases-general`
The major releases of Ceph are summarized at: `Releases`_
.. tip:: For non-US users: There might be a mirror close to you where
to download Ceph from. For more information see: `Ceph Mirrors`_.
@ -396,6 +396,7 @@ line to get the short codename.
.. _Releases: https://docs.ceph.com/en/latest/releases/
.. _the testing Debian repository: https://download.ceph.com/debian-testing/dists
.. _the shaman page: https://shaman.ceph.com
.. _Ceph Mirrors: ../mirrors

View File

@ -1,5 +1,7 @@
:orphan:
.. _ceph_osd-daemon:
========================================
ceph-osd -- ceph object storage daemon
========================================

View File

@ -16,15 +16,10 @@ Synopsis
Description
===========
**ceph-rbdnamer** prints the pool and image name for the given RBD devices
to stdout. It is used by `udev` (using a rule like the one below) to
set up a device symlink.
::
KERNEL=="rbd[0-9]*", PROGRAM="/usr/bin/ceph-rbdnamer %n", SYMLINK+="rbd/%c{1}/%c{2}"
**ceph-rbdnamer** prints the pool, namespace, image and snapshot names
for a given RBD device to stdout. It is used by `udev` device manager
to set up RBD device symlinks. The appropriate `udev` rules are
provided in a file named `50-rbd.rules`.
Availability
============

View File

@ -43,18 +43,6 @@ Descriptions of fields
cap hit rate
.. describe:: rlat
read latency
.. describe:: wlat
write latency
.. describe:: mlat
metadata latency
.. describe:: dlease
dentry lease rate
@ -95,6 +83,29 @@ Descriptions of fields
speed of write IOs compared with the last refresh
.. describe:: rlatavg
average read latency
.. describe:: rlatsd
standard deviation (variance) for read latency
.. describe:: wlatavg
average write latency
.. describe:: wlatsd
standard deviation (variance) for write latency
.. describe:: mlatavg
average metadata latency
.. describe:: mlatsd
standard deviation (variance) for metadata latency
Availability
============

View File

@ -108,6 +108,16 @@ pools; it only runs simulations by mapping values in the range
shows that value **24** is mapped to devices **[11,6]** by rule
**1**.
One of the following is required when using the ``--show-mappings`` option:
(a) ``--num-rep``
(b) both ``--min-rep`` and ``--max-rep``
``--num-rep`` stands for "number of replicas" (the number of replicas in a
pool) and is used to specify an exact number of replicas (for example,
``--num-rep 5``). ``--min-rep`` and ``--max-rep`` are used together to
specify a range of replicas (for example, ``--min-rep 1 --max-rep 10``).
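For example (a minimal sketch assuming a compiled CRUSH map in a file named
``crushmap``), the following run shows the mappings produced by rule ``1``
for three replicas over the input values 0 through 9:

.. prompt:: bash $

crushtool -i crushmap --test --show-mappings --rule 1 --num-rep 3 --min-x 0 --max-x 9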
.. option:: --show-bad-mappings
Displays which value failed to be mapped to the required number of

View File

@ -825,7 +825,8 @@ Per mapping (block device) `rbd device map` options:
* alloc_size - Minimum allocation unit of the underlying OSD object store
backend (since 5.1, default is 64K bytes). This is used to round off and
drop discards that are too small. For bluestore, the recommended setting is
bluestore_min_alloc_size (typically 64K for hard disk drives and 16K for
bluestore_min_alloc_size (currently set to 4K for all types of drives,
previously used to be set to 64K for hard disk drives and 16K for
solid-state drives). For filestore with filestore_punch_hole = false, the
recommended setting is image object size (typically 4M).
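As an illustrative sketch (``mypool/myimage`` is an assumed pool/image pair),
a custom allocation unit can be passed at map time with the ``-o`` option:

.. prompt:: bash $

rbd device map -o alloc_size=65536 mypool/myimage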

File diff suppressed because it is too large Load Diff

View File

@ -4,13 +4,30 @@ Debug
^^^^^
This plugin allows you to customize the behaviour of the dashboard according to the
debug mode. It can be enabled, disabled or checked with the following command::
debug mode. It can be enabled, disabled or checked with the following command:
.. prompt:: bash $
ceph dashboard debug status
::
$ ceph dashboard debug status
Debug: 'disabled'
$ ceph dashboard debug enable
.. prompt:: bash $
ceph dashboard debug enable
::
Debug: 'enabled'
$ ceph dashboard debug disable
.. prompt:: bash $
ceph dashboard debug disable
::
Debug: 'disabled'
By default, it's disabled. This is the recommended setting for production

View File

@ -25,9 +25,14 @@ The list of features that can be enabled/disabled is:
By default all features come enabled.
To retrieve a list of features and their current statuses::
To retrieve a list of features and their current statuses:
.. prompt:: bash $
ceph dashboard feature status
::
$ ceph dashboard feature status
Feature 'cephfs': 'enabled'
Feature 'iscsi': 'enabled'
Feature 'mirroring': 'enabled'
@ -35,9 +40,14 @@ To retrieve a list of features and their current statuses::
Feature 'rgw': 'enabled'
Feature 'nfs': 'enabled'
To enable or disable the status of a single or multiple features::
To enable or disable the status of a single or multiple features:
.. prompt:: bash $
ceph dashboard feature disable iscsi mirroring
::
$ ceph dashboard feature disable iscsi mirroring
Feature 'iscsi': disabled
Feature 'mirroring': disabled

View File

@ -12,17 +12,23 @@ syntax to specify the expiration time: `Ns|m|h|d|w` for seconds, minutes,
hours, days and weeks. If the MOTD should expire after 2 hours, use `2h`
or `5w` for 5 weeks. Use `0` to configure a MOTD that does not expire.
To configure a MOTD, run the following command::
To configure a MOTD, run the following command:
$ ceph dashboard motd set <severity:info|warning|danger> <expires> <message>
.. prompt:: bash $
To show the configured MOTD::
ceph dashboard motd set <severity:info|warning|danger> <expires> <message>
$ ceph dashboard motd get
To show the configured MOTD:
To clear the configured MOTD run::
.. prompt:: bash $
$ ceph dashboard motd clear
ceph dashboard motd get
To clear the configured MOTD run:
.. prompt:: bash $
ceph dashboard motd clear
A MOTD with a `info` or `warning` severity can be closed by the user. The
`info` MOTD is not displayed anymore until the local storage cookies are

View File

@ -5,14 +5,18 @@
Orchestrator CLI
================
This module provides a command line interface (CLI) to orchestrator
modules (``ceph-mgr`` modules which interface with external orchestration services).
This module provides a command line interface (CLI) for orchestrator modules.
Orchestrator modules are ``ceph-mgr`` plugins that interface with external
orchestration services.
As the orchestrator CLI unifies multiple external orchestrators, a common nomenclature
for the orchestrator module is needed.
Definition of Terms
===================
The orchestrator CLI unifies multiple external orchestrators, so we need a
common nomenclature for the orchestrator module:
+--------------------------------------+---------------------------------------+
| *host* | hostname (not DNS name) of the |
| *host* | hostname (not the DNS name) of the |
| | physical host. Not the podname, |
| | container name, or hostname inside |
| | the container. |
@ -20,7 +24,7 @@ for the orchestrator module is needed.
| *service type* | The type of the service. e.g., nfs, |
| | mds, osd, mon, rgw, mgr, iscsi |
+--------------------------------------+---------------------------------------+
| *service* | A logical service, Typically |
| *service* | A logical service. Typically |
| | comprised of multiple service |
| | instances on multiple hosts for HA |
| | |
@ -34,29 +38,28 @@ for the orchestrator module is needed.
| | like LIO or knfsd or whatever) |
| | |
| | This identifier should |
| | uniquely identify the instance |
| | uniquely identify the instance. |
+--------------------------------------+---------------------------------------+
The relation between the names is the following:
* A *service* has a specific *service type*
* A *daemon* is a physical instance of a *service type*
Here is how the names relate:
* A *service* has a specific *service type*.
* A *daemon* is a physical instance of a *service type*.
.. note::
Orchestrator modules may only implement a subset of the commands listed below.
Also, the implementation of the commands may differ between modules.
Orchestrator modules might implement only a subset of the commands listed
below. The implementation of the commands may differ between modules.
Status
======
::
.. prompt:: bash $
ceph orch status [--detail]
ceph orch status [--detail]
Show current orchestrator mode and high-level status (whether the orchestrator
plugin is available and operational)
This command shows the current orchestrator mode and its high-level status
(whether the orchestrator plugin is available and operational).
..
@ -92,15 +95,20 @@ plugin is available and operational)
Stateless services (MDS/RGW/NFS/rbd-mirror/iSCSI)
=================================================
(Please note: The orchestrator will not configure the services. Please look into the corresponding
documentation for service configuration details.)
.. note::
The ``name`` parameter is an identifier of the group of instances:
The orchestrator will not configure the services. See the relevant
documentation for details about how to configure particular services.
* a CephFS file system for a group of MDS daemons,
* a zone name for a group of RGWs
The ``name`` parameter identifies the kind of the group of instances. The
following short list explains the meaning of the ``name`` parameter:
Creating/growing/shrinking/removing services::
* A CephFS file system identifies a group of MDS daemons.
* A zone name identifies a group of RGWs.
Creating/growing/shrinking/removing services:
.. prompt:: bash $
ceph orch apply mds <fs_name> [--placement=<placement>] [--dry-run]
ceph orch apply rgw <name> [--realm=<realm>] [--zone=<zone>] [--port=<port>] [--ssl] [--placement=<placement>] [--dry-run]
@ -111,33 +119,73 @@ where ``placement`` is a :ref:`orchestrator-cli-placement-spec`.
e.g., ``ceph orch apply mds myfs --placement="3 host1 host2 host3"``
Service Commands::
Service Commands:
.. prompt:: bash $
ceph orch <start|stop|restart|redeploy|reconfig> <service_name>
.. note:: These commands apply only to cephadm containerized daemons.
Options
=======
.. option:: start
Start the daemon on the corresponding host.
.. option:: stop
Stop the daemon on the corresponding host.
.. option:: restart
Restart the daemon on the corresponding host.
.. option:: redeploy
Redeploy the ceph daemon on the corresponding host. This will recreate the daemon directory
structure under ``/var/lib/ceph/<fsid>/<daemon-name>`` (if it doesn't exist), refresh its
configuration files, regenerate its unit files, and restart the systemd daemon.
.. option:: reconfig
Reconfigure the daemon on the corresponding host. This will refresh the configuration files and then restart the daemon.
.. note:: This command assumes that the daemon directory ``/var/lib/ceph/<fsid>/<daemon-name>`` already exists.
Configuring the Orchestrator CLI
================================
To enable the orchestrator, select the orchestrator module to use
with the ``set backend`` command::
Enable the orchestrator by using the ``set backend`` command to select the orchestrator module that will be used:
.. prompt:: bash $
ceph orch set backend <module>
For example, to enable the Rook orchestrator module and use it with the CLI::
Example - Configuring the Orchestrator CLI
------------------------------------------
For example, to enable the Rook orchestrator module and use it with the CLI:
.. prompt:: bash $
ceph mgr module enable rook
ceph orch set backend rook
Check the backend is properly configured::
Confirm that the backend is properly configured:
.. prompt:: bash $
ceph orch status
Disable the Orchestrator
------------------------
To disable the orchestrator, use the empty string ``""``::
To disable the orchestrator, use the empty string ``""``:
.. prompt:: bash $
ceph orch set backend ""
ceph mgr module disable rook

View File

@ -4,7 +4,7 @@
The :term:`Ceph Storage Cluster` has a messaging layer protocol that enables
clients to interact with a :term:`Ceph Monitor` and a :term:`Ceph OSD Daemon`.
``librados`` provides this functionality to :term:`Ceph Clients` in the form of
``librados`` provides this functionality to :term:`Ceph Client`\s in the form of
a library. All Ceph Clients either use ``librados`` or the same functionality
encapsulated in ``librados`` to interact with the object store. For example,
``librbd`` and ``libcephfs`` leverage this functionality. You may use

View File

@ -43,19 +43,25 @@ Getting librados for C/C++
--------------------------
To install ``librados`` development support files for C/C++ on Debian/Ubuntu
distributions, execute the following::
distributions, execute the following:
sudo apt-get install librados-dev
.. prompt:: bash $
sudo apt-get install librados-dev
To install ``librados`` development support files for C/C++ on RHEL/CentOS
distributions, execute the following::
distributions, execute the following:
sudo yum install librados2-devel
.. prompt:: bash $
sudo yum install librados2-devel
Once you install ``librados`` for developers, you can find the required
headers for C/C++ under ``/usr/include/rados``. ::
headers for C/C++ under ``/usr/include/rados``:
ls /usr/include/rados
.. prompt:: bash $
ls /usr/include/rados
Getting librados for Python
@ -68,14 +74,25 @@ and the ``librados2-devel`` package for RHEL/CentOS will install the
directly too.
To install ``librados`` development support files for Python on Debian/Ubuntu
distributions, execute the following::
distributions, execute the following:
sudo apt-get install python-rados
.. prompt:: bash $
sudo apt-get install python3-rados
To install ``librados`` development support files for Python on RHEL/CentOS
distributions, execute the following::
distributions, execute the following:
sudo yum install python-rados
.. prompt:: bash $
sudo yum install python-rados
To install ``librados`` development support files for Python on SLE/openSUSE
distributions, execute the following:
.. prompt:: bash $
sudo zypper install python3-rados
You can find the module under ``/usr/share/pyshared`` on Debian systems,
or under ``/usr/lib/python*/site-packages`` on CentOS/RHEL systems.
@ -86,37 +103,49 @@ Getting librados for Java
To install ``librados`` for Java, you need to execute the following procedure:
#. Install ``jna.jar``. For Debian/Ubuntu, execute::
#. Install ``jna.jar``. For Debian/Ubuntu, execute:
sudo apt-get install libjna-java
.. prompt:: bash $
For CentOS/RHEL, execute::
sudo apt-get install libjna-java
sudo yum install jna
For CentOS/RHEL, execute:
.. prompt:: bash $
sudo yum install jna
The JAR files are located in ``/usr/share/java``.
#. Clone the ``rados-java`` repository::
#. Clone the ``rados-java`` repository:
git clone --recursive https://github.com/ceph/rados-java.git
.. prompt:: bash $
#. Build the ``rados-java`` repository::
git clone --recursive https://github.com/ceph/rados-java.git
cd rados-java
ant
#. Build the ``rados-java`` repository:
.. prompt:: bash $
cd rados-java
ant
The JAR file is located under ``rados-java/target``.
#. Copy the JAR for RADOS to a common location (e.g., ``/usr/share/java``) and
ensure that it and the JNA JAR are in your JVM's classpath. For example::
ensure that it and the JNA JAR are in your JVM's classpath. For example:
sudo cp target/rados-0.1.3.jar /usr/share/java/rados-0.1.3.jar
sudo ln -s /usr/share/java/jna-3.2.7.jar /usr/lib/jvm/default-java/jre/lib/ext/jna-3.2.7.jar
sudo ln -s /usr/share/java/rados-0.1.3.jar /usr/lib/jvm/default-java/jre/lib/ext/rados-0.1.3.jar
.. prompt:: bash $
To build the documentation, execute the following::
sudo cp target/rados-0.1.3.jar /usr/share/java/rados-0.1.3.jar
sudo ln -s /usr/share/java/jna-3.2.7.jar /usr/lib/jvm/default-java/jre/lib/ext/jna-3.2.7.jar
sudo ln -s /usr/share/java/rados-0.1.3.jar /usr/lib/jvm/default-java/jre/lib/ext/rados-0.1.3.jar
ant docs
To build the documentation, execute the following:
.. prompt:: bash $
ant docs
Getting librados for PHP
@ -124,29 +153,37 @@ Getting librados for PHP
To install the ``librados`` extension for PHP, you need to execute the following procedure:
#. Install php-dev. For Debian/Ubuntu, execute::
#. Install php-dev. For Debian/Ubuntu, execute:
sudo apt-get install php5-dev build-essential
.. prompt:: bash $
For CentOS/RHEL, execute::
sudo apt-get install php5-dev build-essential
sudo yum install php-devel
For CentOS/RHEL, execute:
#. Clone the ``phprados`` repository::
.. prompt:: bash $
git clone https://github.com/ceph/phprados.git
sudo yum install php-devel
#. Build ``phprados``::
#. Clone the ``phprados`` repository:
cd phprados
phpize
./configure
make
sudo make install
.. prompt:: bash $
#. Enable ``phprados`` in php.ini by adding::
git clone https://github.com/ceph/phprados.git
extension=rados.so
#. Build ``phprados``:
.. prompt:: bash $
cd phprados
phpize
./configure
make
sudo make install
#. Enable ``phprados`` by adding the following line to ``php.ini``::
extension=rados.so
Step 2: Configuring a Cluster Handle
@ -321,9 +358,11 @@ it and connecting to the cluster might look something like this:
}
Compile your client and link to ``librados`` using ``-lrados``. For example::
Compile your client and link to ``librados`` using ``-lrados``. For example:
gcc ceph-client.c -lrados -o ceph-client
.. prompt:: bash $
gcc ceph-client.c -lrados -o ceph-client
C++ Example
@ -399,10 +438,12 @@ you to initialize a ``librados::Rados`` cluster handle object:
Compile the source; then, link ``librados`` using ``-lrados``.
For example::
For example:
g++ -g -c ceph-client.cc -o ceph-client.o
g++ -g ceph-client.o -lrados -o ceph-client
.. prompt:: bash $
g++ -g -c ceph-client.cc -o ceph-client.o
g++ -g ceph-client.o -lrados -o ceph-client
@ -436,9 +477,11 @@ into exceptions.
print "Connected to the cluster."
Execute the example to verify that it connects to your cluster. ::
Execute the example to verify that it connects to your cluster:
python ceph-client.py
.. prompt:: bash $
python ceph-client.py
Java Example
@ -478,10 +521,12 @@ binding converts C++-based errors into exceptions.
Compile the source; then, run it. If you have copied the JAR to
``/usr/share/java`` and sym linked from your ``ext`` directory, you won't need
to specify the classpath. For example::
to specify the classpath. For example:
javac CephClient.java
java CephClient
.. prompt:: bash $
javac CephClient.java
java CephClient
PHP Example
@ -502,9 +547,11 @@ With the RADOS extension enabled in PHP you can start creating a new cluster han
}
Save this as rados.php and run the code::
Save this as rados.php and run the code:
php rados.php
.. prompt:: bash $
php rados.php
Step 3: Creating an I/O Context

View File

@ -68,7 +68,7 @@ Your Python client also requires a client keyring. For this example, we use the
``client.admin`` key by default. If you would like to specify the keyring when
creating the cluster handle, you may use the ``conf`` argument. Alternatively,
you may specify the keyring path in your Ceph configuration file. For example,
you may add something like the following line to you Ceph configuration file::
you may add something like the following line to your Ceph configuration file::
keyring = /path/to/ceph.client.admin.keyring

View File

@ -195,9 +195,11 @@ specify a ``keyring`` entry in your Ceph configuration file.
We recommend copying the Ceph Storage Cluster's keyring file to nodes where you
will run administrative commands, because it contains the ``client.admin`` key.
To perform this step manually, execute the following::
To perform this step manually, execute the following:
sudo scp {user}@{ceph-cluster-host}:/etc/ceph/ceph.client.admin.keyring /etc/ceph/ceph.client.admin.keyring
.. prompt:: bash $
sudo scp {user}@{ceph-cluster-host}:/etc/ceph/ceph.client.admin.keyring /etc/ceph/ceph.client.admin.keyring
.. tip:: Ensure the ``ceph.keyring`` file has appropriate permissions set
(e.g., ``chmod 644``) on your client machine.

View File

@ -42,13 +42,17 @@ it will fit). This means that if a DB device is specified but an explicit
WAL device is not, the WAL will be implicitly colocated with the DB on the faster
device.
A single-device (colocated) BlueStore OSD can be provisioned with::
A single-device (colocated) BlueStore OSD can be provisioned with:
ceph-volume lvm prepare --bluestore --data <device>
.. prompt:: bash $
To specify a WAL device and/or DB device, ::
ceph-volume lvm prepare --bluestore --data <device>
ceph-volume lvm prepare --bluestore --data <device> --block.wal <wal-device> --block.db <db-device>
To specify a WAL device and/or DB device:
.. prompt:: bash $
ceph-volume lvm prepare --bluestore --data <device> --block.wal <wal-device> --block.db <db-device>
.. note:: ``--data`` can be a Logical Volume using *vg/lv* notation. Other
devices can be existing logical volumes or GPT partitions.
@ -64,17 +68,21 @@ the deployment strategy:
**block (data) only**
^^^^^^^^^^^^^^^^^^^^^
If all devices are the same type, for example all rotational drives, and
there are no fast devices to use for metadata, it makes sense to specifiy the
there are no fast devices to use for metadata, it makes sense to specify the
block device only and to not separate ``block.db`` or ``block.wal``. The
:ref:`ceph-volume-lvm` command for a single ``/dev/sda`` device looks like::
:ref:`ceph-volume-lvm` command for a single ``/dev/sda`` device looks like:
ceph-volume lvm create --bluestore --data /dev/sda
.. prompt:: bash $
ceph-volume lvm create --bluestore --data /dev/sda
If logical volumes have already been created for each device, (a single LV
using 100% of the device), then the :ref:`ceph-volume-lvm` call for an LV named
``ceph-vg/block-lv`` would look like::
``ceph-vg/block-lv`` would look like:
ceph-volume lvm create --bluestore --data ceph-vg/block-lv
.. prompt:: bash $
ceph-volume lvm create --bluestore --data ceph-vg/block-lv
.. _bluestore-mixed-device-config:
@ -88,35 +96,43 @@ You must create these volume groups and logical volumes manually as
the ``ceph-volume`` tool is currently not able to do so automatically.
For the below example, let us assume four rotational (``sda``, ``sdb``, ``sdc``, and ``sdd``)
and one (fast) solid state drive (``sdx``). First create the volume groups::
and one (fast) solid state drive (``sdx``). First create the volume groups:
$ vgcreate ceph-block-0 /dev/sda
$ vgcreate ceph-block-1 /dev/sdb
$ vgcreate ceph-block-2 /dev/sdc
$ vgcreate ceph-block-3 /dev/sdd
.. prompt:: bash $
Now create the logical volumes for ``block``::
vgcreate ceph-block-0 /dev/sda
vgcreate ceph-block-1 /dev/sdb
vgcreate ceph-block-2 /dev/sdc
vgcreate ceph-block-3 /dev/sdd
$ lvcreate -l 100%FREE -n block-0 ceph-block-0
$ lvcreate -l 100%FREE -n block-1 ceph-block-1
$ lvcreate -l 100%FREE -n block-2 ceph-block-2
$ lvcreate -l 100%FREE -n block-3 ceph-block-3
Now create the logical volumes for ``block``:
.. prompt:: bash $
lvcreate -l 100%FREE -n block-0 ceph-block-0
lvcreate -l 100%FREE -n block-1 ceph-block-1
lvcreate -l 100%FREE -n block-2 ceph-block-2
lvcreate -l 100%FREE -n block-3 ceph-block-3
We are creating 4 OSDs for the four slow spinning devices, so assuming a 200GB
SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB::
SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB:
$ vgcreate ceph-db-0 /dev/sdx
$ lvcreate -L 50GB -n db-0 ceph-db-0
$ lvcreate -L 50GB -n db-1 ceph-db-0
$ lvcreate -L 50GB -n db-2 ceph-db-0
$ lvcreate -L 50GB -n db-3 ceph-db-0
.. prompt:: bash $
Finally, create the 4 OSDs with ``ceph-volume``::
vgcreate ceph-db-0 /dev/sdx
lvcreate -L 50GB -n db-0 ceph-db-0
lvcreate -L 50GB -n db-1 ceph-db-0
lvcreate -L 50GB -n db-2 ceph-db-0
lvcreate -L 50GB -n db-3 ceph-db-0
$ ceph-volume lvm create --bluestore --data ceph-block-0/block-0 --block.db ceph-db-0/db-0
$ ceph-volume lvm create --bluestore --data ceph-block-1/block-1 --block.db ceph-db-0/db-1
$ ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
$ ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
Finally, create the 4 OSDs with ``ceph-volume``:
.. prompt:: bash $
ceph-volume lvm create --bluestore --data ceph-block-0/block-0 --block.db ceph-db-0/db-0
ceph-volume lvm create --bluestore --data ceph-block-1/block-1 --block.db ceph-db-0/db-1
ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
These operations should end up creating four OSDs, with ``block`` on the slower
rotational drives with a 50 GB logical volume (DB) for each on the solid state
@ -139,7 +155,7 @@ In older releases, internal level sizes mean that the DB can fully utilize only
specific partition / LV sizes that correspond to sums of L0, L0+L1, L1+L2,
etc. sizes, which with default settings means roughly 3 GB, 30 GB, 300 GB, and
so forth. Most deployments will not substantially benefit from sizing to
accomodate L3 and higher, though DB compaction can be facilitated by doubling
accommodate L3 and higher, though DB compaction can be facilitated by doubling
these figures to 6GB, 60GB, and 600GB.
Improvements in releases beginning with Nautilus 14.2.12 and Octopus 15.2.6
@ -167,93 +183,6 @@ of priorities. If priority information is not available, the
``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio`` options are
used as fallbacks.
``bluestore_cache_autotune``
:Description: Automatically tune the space ratios assigned to various BlueStore
caches while respecting minimum values.
:Type: Boolean
:Required: Yes
:Default: ``True``
``osd_memory_target``
:Description: When TCMalloc is available and cache autotuning is enabled, try to
keep this many bytes mapped in memory. Note: This may not exactly
match the RSS memory usage of the process. While the total amount
of heap memory mapped by the process should usually be close
to this target, there is no guarantee that the kernel will actually
reclaim memory that has been unmapped. During initial development,
it was found that some kernels result in the OSD's RSS memory
exceeding the mapped memory by up to 20%. It is hypothesised
however, that the kernel generally may be more aggressive about
reclaiming unmapped memory when there is a high amount of memory
pressure. Your mileage may vary.
:Type: Unsigned Integer
:Required: Yes
:Default: ``4294967296``
``bluestore_cache_autotune_chunk_size``
:Description: The chunk size in bytes to allocate to caches when cache autotune
is enabled. When the autotuner assigns memory to various caches,
it will allocate memory in chunks. This is done to avoid
evictions when there are minor fluctuations in the heap size or
autotuned cache ratios.
:Type: Unsigned Integer
:Required: No
:Default: ``33554432``
``bluestore_cache_autotune_interval``
:Description: The number of seconds to wait between rebalances when cache autotune
is enabled. This setting changes how quickly the allocation ratios of
various caches are recomputed. Note: Setting this interval too small
can result in high CPU usage and lower performance.
:Type: Float
:Required: No
:Default: ``5``
``osd_memory_base``
:Description: When TCMalloc and cache autotuning are enabled, estimate the minimum
amount of memory in bytes the OSD will need. This is used to help
the autotuner estimate the expected aggregate memory consumption of
the caches.
:Type: Unsigned Integer
:Required: No
:Default: ``805306368``
``osd_memory_expected_fragmentation``
:Description: When TCMalloc and cache autotuning is enabled, estimate the
percentage of memory fragmentation. This is used to help the
autotuner estimate the expected aggregate memory consumption
of the caches.
:Type: Float
:Required: No
:Default: ``0.15``
``osd_memory_cache_min``
:Description: When TCMalloc and cache autotuning are enabled, set the minimum
amount of memory used for caches. Note: Setting this value too
low can result in significant cache thrashing.
:Type: Unsigned Integer
:Required: No
:Default: ``134217728``
``osd_memory_cache_resize_interval``
:Description: When TCMalloc and cache autotuning are enabled, wait this many
seconds between resizing caches. This setting changes the total
amount of memory available for BlueStore to use for caching. Note
that setting this interval too small can result in memory allocator
thrashing and lower performance.
:Type: Float
:Required: No
:Default: ``1``
Manual Cache Sizing
===================
@ -286,53 +215,6 @@ device) as well as the meta and kv ratios.
The data fraction can be calculated by
``<effective_cache_size> * (1 - bluestore_cache_meta_ratio - bluestore_cache_kv_ratio)``
``bluestore_cache_size``
:Description: The amount of memory BlueStore will use for its cache. If zero,
``bluestore_cache_size_hdd`` or ``bluestore_cache_size_ssd`` will
be used instead.
:Type: Unsigned Integer
:Required: Yes
:Default: ``0``
``bluestore_cache_size_hdd``
:Description: The default amount of memory BlueStore will use for its cache when
backed by an HDD.
:Type: Unsigned Integer
:Required: Yes
:Default: ``1 * 1024 * 1024 * 1024`` (1 GB)
``bluestore_cache_size_ssd``
:Description: The default amount of memory BlueStore will use for its cache when
backed by an SSD.
:Type: Unsigned Integer
:Required: Yes
:Default: ``3 * 1024 * 1024 * 1024`` (3 GB)
``bluestore_cache_meta_ratio``
:Description: The ratio of cache devoted to metadata.
:Type: Floating point
:Required: Yes
:Default: ``.4``
``bluestore_cache_kv_ratio``
:Description: The ratio of cache devoted to key/value data (RocksDB).
:Type: Floating point
:Required: Yes
:Default: ``.4``
``bluestore_cache_kv_max``
:Description: The maximum amount of cache devoted to key/value data (RocksDB).
:Type: Unsigned Integer
:Required: Yes
:Default: ``512 * 1024*1024`` (512 MB)
Checksums
=========
@ -358,18 +240,11 @@ The smaller checksum values can be used by selecting `crc32c_16` or
`crc32c_8` as the checksum algorithm.
The *checksum algorithm* can be set either via a per-pool
``csum_type`` property or the global config option. For example, ::
``csum_type`` property or the global config option. For example:
ceph osd pool set <pool-name> csum_type <algorithm>
``bluestore_csum_type``
:Description: The default checksum algorithm to use.
:Type: String
:Required: Yes
:Valid Settings: ``none``, ``crc32c``, ``crc32c_16``, ``crc32c_8``, ``xxhash32``, ``xxhash64``
:Default: ``crc32c``
.. prompt:: bash $
ceph osd pool set <pool-name> csum_type <algorithm>
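For instance (``mypool`` is an assumed pool name), to select the smaller
16-bit checksum for a single pool:

.. prompt:: bash $

ceph osd pool set mypool csum_type crc32c_16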
Inline Compression
==================
@ -401,107 +276,47 @@ must be 70% of the size of the original (or smaller).
The *compression mode*, *compression algorithm*, *compression required
ratio*, *min blob size*, and *max blob size* can be set either via a
per-pool property or a global config option. Pool properties can be
set with::
set with:
ceph osd pool set <pool-name> compression_algorithm <algorithm>
ceph osd pool set <pool-name> compression_mode <mode>
ceph osd pool set <pool-name> compression_required_ratio <ratio>
ceph osd pool set <pool-name> compression_min_blob_size <size>
ceph osd pool set <pool-name> compression_max_blob_size <size>
.. prompt:: bash $
``bluestore_compression_algorithm``
ceph osd pool set <pool-name> compression_algorithm <algorithm>
ceph osd pool set <pool-name> compression_mode <mode>
ceph osd pool set <pool-name> compression_required_ratio <ratio>
ceph osd pool set <pool-name> compression_min_blob_size <size>
ceph osd pool set <pool-name> compression_max_blob_size <size>
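A concrete sketch (again assuming a pool named ``mypool``) that enables
aggressive ``lz4`` compression on one pool:

.. prompt:: bash $

ceph osd pool set mypool compression_algorithm lz4
ceph osd pool set mypool compression_mode aggressive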
:Description: The default compressor to use (if any) if the per-pool property
``compression_algorithm`` is not set. Note that ``zstd`` is *not*
recommended for BlueStore due to high CPU overhead when
compressing small amounts of data.
:Type: String
:Required: No
:Valid Settings: ``lz4``, ``snappy``, ``zlib``, ``zstd``
:Default: ``snappy``
.. _bluestore-rocksdb-sharding:
``bluestore_compression_mode``
RocksDB Sharding
================
:Description: The default policy for using compression if the per-pool property
``compression_mode`` is not set. ``none`` means never use
compression. ``passive`` means use compression when
:c:func:`clients hint <rados_set_alloc_hint>` that data is
compressible. ``aggressive`` means use compression unless
clients hint that data is not compressible. ``force`` means use
compression under all circumstances even if the clients hint that
the data is not compressible.
:Type: String
:Required: No
:Valid Settings: ``none``, ``passive``, ``aggressive``, ``force``
:Default: ``none``
Internally BlueStore uses multiple types of key-value data,
stored in RocksDB. Each data type in BlueStore is assigned a
unique prefix. Until Pacific all key-value data was stored in
single RocksDB column family: 'default'. Since Pacific,
BlueStore can divide this data into multiple RocksDB column
families. When keys have similar access frequency, modification
frequency and lifetime, BlueStore benefits from better caching
and more precise compaction. This improves performance, and also
requires less disk space during compaction, since each column
family is smaller and can compact independent of others.
``bluestore_compression_required_ratio``
OSDs deployed in Pacific or later use RocksDB sharding by default.
If Ceph is upgraded to Pacific from a previous version, sharding is off.
:Description: The ratio of the size of the data chunk after
compression relative to the original size must be at
least this small in order to store the compressed
version.
To enable sharding and apply the Pacific defaults, stop an OSD and run
:Type: Floating point
:Required: No
:Default: .875
.. prompt:: bash #
``bluestore_compression_min_blob_size``
ceph-bluestore-tool \
--path <data path> \
--sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \
reshard
:Description: Chunks smaller than this are never compressed.
The per-pool property ``compression_min_blob_size`` overrides
this setting.
:Type: Unsigned Integer
:Required: No
:Default: 0
``bluestore_compression_min_blob_size_hdd``
:Description: Default value of ``bluestore compression min blob size``
for rotational media.
:Type: Unsigned Integer
:Required: No
:Default: 128K
``bluestore_compression_min_blob_size_ssd``
:Description: Default value of ``bluestore compression min blob size``
for non-rotational (solid state) media.
:Type: Unsigned Integer
:Required: No
:Default: 8K
``bluestore_compression_max_blob_size``
:Description: Chunks larger than this value are broken into smaller blobs of at most
``bluestore_compression_max_blob_size`` bytes before being compressed.
The per-pool property ``compression_max_blob_size`` overrides
this setting.
:Type: Unsigned Integer
:Required: No
:Default: 0
``bluestore_compression_max_blob_size_hdd``
:Description: Default value of ``bluestore compression max blob size``
for rotational media.
:Type: Unsigned Integer
:Required: No
:Default: 512K
``bluestore_compression_max_blob_size_ssd``
:Description: Default value of ``bluestore compression max blob size``
for non-rotational (SSD, NVMe) media.
:Type: Unsigned Integer
:Required: No
:Default: 64K
Throttling
==========
SPDK Usage
==================
@ -512,29 +327,38 @@ Refer to `SPDK document`__ for more details.
.. __: http://www.spdk.io/doc/getting_started.html#getting_started_examples
SPDK offers a script to configure the device automatically. Users can run the
script as root::
script as root:
$ sudo src/spdk/scripts/setup.sh
.. prompt:: bash $
sudo src/spdk/scripts/setup.sh
You will need to specify the subject NVMe device's device selector with
the "spdk:" prefix for ``bluestore_block_path``.
For example, you can find the device selector of an Intel PCIe SSD with::
For example, you can find the device selector of an Intel PCIe SSD with:
$ lspci -mm -n -D -d 8086:0953
.. prompt:: bash $
lspci -mm -n -D -d 8086:0953
The device selector always has the form of ``DDDD:BB:DD.FF`` or ``DDDD.BB.DD.FF``.
and then set::
bluestore_block_path = spdk:0000:01:00.0
bluestore_block_path = "spdk:trtype:PCIe traddr:0000:01:00.0"
Where ``0000:01:00.0`` is the device selector found in the output of ``lspci``
command above.
You may also specify a remote NVMeoF target over the TCP transport as in the
following example::
bluestore_block_path = "spdk:trtype:TCP traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1"
To run multiple SPDK instances per node, you must specify the
amount of dpdk memory in MB that each instance will use, to make sure each
instance uses its own dpdk memory
instance uses its own DPDK memory.
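As a sketch, assuming your release provides the ``bluestore_spdk_mem`` option
(the per-OSD DPDK memory size in MB), this could be set in ``ceph.conf``::

    [osd]
    # illustrative value; size this to your deployment
    bluestore_spdk_mem = 2048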
In most cases, a single device can be used for data, DB, and WAL. We describe
this strategy as *colocating* these components. Be sure to enter the below
@ -547,3 +371,112 @@ settings to ensure that all IOs are issued through SPDK.::
Otherwise, the current implementation will populate the SPDK map files with
kernel file system symbols and will use the kernel driver to issue DB/WAL IO.
Minimum Allocation Size
========================
There is a configured minimum amount of storage that BlueStore will allocate on
an OSD. In practice, this is the least amount of capacity that a RADOS object
can consume. The value of `bluestore_min_alloc_size` is derived from the
value of `bluestore_min_alloc_size_hdd` or `bluestore_min_alloc_size_ssd`
depending on the OSD's ``rotational`` attribute. This means that when an OSD
is created on an HDD, BlueStore will be initialized with the current value
of `bluestore_min_alloc_size_hdd`, and SSD OSDs (including NVMe devices)
with the value of `bluestore_min_alloc_size_ssd`.
Through the Mimic release, the default values were 64KB and 16KB for rotational
(HDD) and non-rotational (SSD) media respectively. Octopus changed the default
for SSD (non-rotational) media to 4KB, and Pacific changed the default for HDD
(rotational) media to 4KB as well.
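You can confirm the compiled-in default for your release with ``ceph config
help``, for example:

.. prompt:: bash $

   ceph config help bluestore_min_alloc_size_hdd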
These changes were driven by space amplification experienced by Ceph RADOS
GateWay (RGW) deployments that host large numbers of small files
(S3/Swift objects).
For example, when an RGW client stores a 1KB S3 object, it is written to a
single RADOS object. With the default `min_alloc_size` value, 4KB of
underlying drive space is allocated. This means that roughly
(4KB - 1KB) == 3KB is allocated but never used, which corresponds to 300%
overhead or 25% efficiency. Similarly, a 5KB user object will be stored
as one 4KB and one 1KB RADOS object, again stranding roughly 3KB of device capacity,
though in this case the overhead is a much smaller percentage. Think of this
in terms of the remainder from a modulus operation. The overhead *percentage*
thus decreases rapidly as user object size increases.
An easily missed additional subtlety is that this
takes place for *each* replica. So when using the default three copies of
data (3R), a 1KB S3 object actually consumes roughly 9KB of storage device
capacity. If erasure coding (EC) is used instead of replication, the
amplification may be even higher: for a ``k=4,m=2`` pool, our 1KB S3 object
will allocate (6 * 4KB) = 24KB of device capacity.
When an RGW bucket pool contains many relatively large user objects, the effect
of this phenomenon is often negligible, but should be considered for deployments
that expect a significant fraction of relatively small objects.
The 4KB default value aligns well with conventional HDD and SSD devices. Some
new coarse-IU (Indirection Unit) QLC SSDs, however, perform and wear best
when `bluestore_min_alloc_size_ssd`
is set at OSD creation to match the device's IU: 8KB, 16KB, or even 64KB.
These novel storage drives allow one to achieve read performance competitive
with conventional TLC SSDs and write performance faster than HDDs, with
high density and lower cost than TLC SSDs.
Note that when creating OSDs on these devices, one must carefully apply the
non-default value only to appropriate devices, and not to conventional SSD and
HDD devices. This may be done through careful ordering of OSD creation, custom
OSD device classes, and especially by the use of central configuration *masks*.
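For example, one way to scope the non-default value with a central
configuration mask is to target only the hosts that hold the coarse-IU drives
before their OSDs are created; the host name and value below are purely
illustrative:

.. prompt:: bash $

   ceph config set osd/host:qlc-host-01 bluestore_min_alloc_size_ssd 65536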
Quincy and later releases add
the `bluestore_use_optimal_io_size_for_min_alloc_size`
option that enables automatic discovery of the appropriate value as each OSD is
created. Note that the use of ``bcache``, ``OpenCAS``, ``dmcrypt``,
``ATA over Ethernet``, ``iSCSI``, or other device layering / abstraction
technologies may confound the determination of appropriate values. OSDs
deployed on top of VMware storage have been reported to also
sometimes report a ``rotational`` attribute that does not match the underlying
hardware.
We suggest inspecting such OSDs at startup via logs and admin sockets to ensure that
behavior is appropriate. Note that this also may not work as desired with
older kernels. You can check for this by examining the presence and value
of ``/sys/block/<drive>/queue/optimal_io_size``.
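For example (substitute your actual device name):

.. prompt:: bash #

   cat /sys/block/sdc/queue/optimal_io_size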
You may also inspect a given OSD:
.. prompt:: bash #
ceph osd metadata osd.1701 | grep rotational
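Depending on the release, the OSD metadata may also record the allocation size
that the OSD was created with; if your release exposes it, a similar check is
possible:

.. prompt:: bash #

   ceph osd metadata osd.1701 | grep min_alloc_size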
This space amplification may manifest as an unusually high ratio of raw to
stored data reported by ``ceph df``. ``ceph osd df`` may also report
anomalously high ``%USE`` / ``VAR`` values when
compared to other, ostensibly identical OSDs. A pool using OSDs with
mismatched ``min_alloc_size`` values may experience unexpected balancer
behavior as well.
Note that this BlueStore attribute takes effect *only* at OSD creation; if
changed later, a given OSD's behavior will not change unless / until it is
destroyed and redeployed with the appropriate option value(s). Upgrading
to a later Ceph release will *not* change the value used by OSDs deployed
under older releases or with other settings.
DSA (Data Streaming Accelerator Usage)
======================================
If you want to use the DML library to drive the DSA device for offloading
read/write operations on persistent memory in BlueStore, you need to install
the `DML`_ and `idxd-config`_ libraries on a machine with an SPR (Sapphire Rapids) CPU.
.. _DML: https://github.com/intel/DML
.. _idxd-config: https://github.com/intel/idxd-config
After installing the DML software, you need to configure the shared
work queues (WQs) using the ``accel-config`` tool, as in the following example:
.. prompt:: bash $
accel-config config-wq --group-id=1 --mode=shared --wq-size=16 --threshold=15 --type=user --name="MyApp1" --priority=10 --block-on-fault=1 dsa0/wq0.1
accel-config config-engine dsa0/engine0.1 --group-id=1
accel-config enable-device dsa0
accel-config enable-wq dsa0/wq0.1
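To verify the resulting DSA configuration, the ``accel-config`` tool can list
the enabled devices and work queues (a sanity check only, not something Ceph
itself requires):

.. prompt:: bash $

   accel-config list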

View File

@ -484,17 +484,26 @@ The following CLI commands are used to configure the cluster:
Help
====
You can get help for a particular option with::
You can get help for a particular option with:
ceph config help <option>
.. prompt:: bash $
Note that this will use the configuration schema that is compiled into the running monitors. If you have a mixed-version cluster (e.g., during an upgrade), you might also want to query the option schema from a specific running daemon::
ceph config help <option>
ceph daemon <name> config help [option]
Note that this will use the configuration schema that is compiled into the running monitors. If you have a mixed-version cluster (e.g., during an upgrade), you might also want to query the option schema from a specific running daemon:
For example,::
.. prompt:: bash $
ceph daemon <name> config help [option]
For example:
.. prompt:: bash $
ceph config help log_file
::
$ ceph config help log_file
log_file - path to log file
(std::string, basic)
Default (non-daemon):
@ -502,9 +511,14 @@ For example,::
Can update at runtime: false
See also: [log_to_stderr,err_to_stderr,log_to_syslog,err_to_syslog]
or::
or:
.. prompt:: bash $
ceph config help log_file -f json-pretty
::
$ ceph config help log_file -f json-pretty
{
"name": "log_file",
"type": "std::string",
@ -541,9 +555,11 @@ increasing/decreasing logging output, enabling/disabling debug
settings, and even for runtime optimization.
Generally speaking, configuration options can be updated in the usual
way via the ``ceph config set`` command. For example, do enable the debug log level on a specific OSD,::
way via the ``ceph config set`` command. For example, to enable the debug log level on a specific OSD:
ceph config set osd.123 debug_ms 20
.. prompt:: bash $
ceph config set osd.123 debug_ms 20
Note that if the same option is also customized in a local
configuration file, the monitor setting will be ignored (it has a
@ -559,28 +575,38 @@ the daemon or process restarts.
Override values can be set in two ways:
#. From any host, we can send a message to a daemon over the network with::
#. From any host, we can send a message to a daemon over the network with:
.. prompt:: bash $
ceph tell <name> config set <option> <value>
ceph tell <name> config set <option> <value>
For example,::
For example:
.. prompt:: bash $
ceph tell osd.123 config set debug_osd 20
ceph tell osd.123 config set debug_osd 20
The `tell` command can also accept a wildcard for the daemon
identifier. For example, to adjust the debug level on all OSD
daemons,::
daemons:
.. prompt:: bash $
ceph tell osd.* config set debug_osd 20
ceph tell osd.* config set debug_osd 20
#. From the host the process is running on, we can connect directly to
the process via a socket in ``/var/run/ceph`` with::
the process via a socket in ``/var/run/ceph`` with:
ceph daemon <name> config set <option> <value>
.. prompt:: bash $
For example,::
ceph daemon <name> config set <option> <value>
ceph daemon osd.4 config set debug_osd 20
For example:
.. prompt:: bash $
ceph daemon osd.4 config set debug_osd 20
Note that in the ``ceph config show`` command output these temporary
values will be shown with a source of ``override``.
@ -589,29 +615,41 @@ values will be shown with a source of ``override``.
Viewing runtime settings
========================
You can see the current options set for a running daemon with the ``ceph config show`` command. For example,::
You can see the current options set for a running daemon with the ``ceph config show`` command. For example:
ceph config show osd.0
.. prompt:: bash $
will show you the (non-default) options for that daemon. You can also look at a specific option with::
ceph config show osd.0
ceph config show osd.0 debug_osd
will show you the (non-default) options for that daemon. You can also look at a specific option with:
or view all options (even those with default values) with::
.. prompt:: bash $
ceph config show-with-defaults osd.0
ceph config show osd.0 debug_osd
You can also observe settings for a running daemon by connecting to it from the local host via the admin socket. For example,::
or view all options (even those with default values) with:
ceph daemon osd.0 config show
.. prompt:: bash $
will dump all current settings,::
ceph config show-with-defaults osd.0
ceph daemon osd.0 config diff
You can also observe settings for a running daemon by connecting to it from the local host via the admin socket. For example:
will show only non-default settings (as well as where the value came from: a config file, the monitor, an override, etc.), and::
.. prompt:: bash $
ceph daemon osd.0 config get debug_osd
ceph daemon osd.0 config show
will dump all current settings:
.. prompt:: bash $
ceph daemon osd.0 config diff
will show only non-default settings (as well as where the value came from: a config file, the monitor, an override, etc.), and:
.. prompt:: bash $
ceph daemon osd.0 config get debug_osd
will report the value of a single option.
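To see the value stored for an option in the cluster's central configuration
database, which may differ from what a running daemon is currently using,
``ceph config get`` can be used as well, for example:

.. prompt:: bash $

   ceph config get osd.0 debug_osd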

View File

@ -2,12 +2,13 @@
Configuration
===============
Each Ceph process, daemon, or utility draws its configuration from
several sources on startup, include a local configuration, the
monitors, the command line, or environment variables. Configuration
options may be set globally such that they apply to all daemons, to
all daemons or services of a particular type, or only to a specific
daemon, process, or client.
Each Ceph process, daemon, or utility draws its configuration from several
sources on startup. Such sources can include (1) a local configuration, (2) the
monitors, (3) the command line, and (4) environment variables.
Configuration options can be set globally so that they apply (1) to all
daemons, (2) to all daemons or services of a particular type, or (3) to only a
specific daemon, process, or client.
.. raw:: html

View File

@ -1,3 +1,5 @@
.. _monitor-config-reference:
==========================
Monitor Config Reference
==========================

View File

@ -163,16 +163,21 @@ By default, ``ms_bind_msgr2`` is true starting with Nautilus 14.2.z.
However, until the monitors start using v2, only limited services will
start advertising v2 addresses.
For most users, the monitors are binding to the default legacy port ``6789`` for the v1 protocol. When this is the case, enabling v2 is as simple as::
For most users, the monitors are binding to the default legacy port ``6789``
for the v1 protocol. When this is the case, enabling v2 is as simple as:
ceph mon enable-msgr2
.. prompt:: bash $
ceph mon enable-msgr2
If the monitors are bound to non-standard ports, you will need to
specify an additional port for v2 explicitly. For example, if your
monitor ``mon.a`` binds to ``1.2.3.4:1111``, and you want to add v2 on
port ``1112``,::
port ``1112``:
ceph mon set-addrs a [v2:1.2.3.4:1112,v1:1.2.3.4:1111]
.. prompt:: bash $
ceph mon set-addrs a [v2:1.2.3.4:1112,v1:1.2.3.4:1111]
Once the monitors bind to v2, each daemon will start advertising a v2
address when it is next restarted.
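To confirm which addresses each monitor is advertising after the restart, the
monitor map can be inspected, for example:

.. prompt:: bash $

   ceph mon dump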

View File

@ -60,7 +60,9 @@ By default, daemons `bind`_ to ports within the ``6800:7300`` range. You may
configure this range at your discretion. Before configuring your IP tables,
check the default ``iptables`` configuration.
sudo iptables -L
.. prompt:: bash $
sudo iptables -L
Some Linux distributions include rules that reject all inbound requests
except SSH from all network interfaces. For example::
@ -80,7 +82,9 @@ default. Additionally, Ceph Monitors always operate on the public
network. When you add the rule using the example below, make sure you
replace ``{iface}`` with the public network interface (e.g., ``eth0``,
``eth1``, etc.), ``{ip-address}`` with the IP address of the public
network and ``{netmask}`` with the netmask for the public network. ::
network and ``{netmask}`` with the netmask for the public network:
.. prompt:: bash $
sudo iptables -A INPUT -i {iface} -p tcp -s {ip-address}/{netmask} --dport 6789 -j ACCEPT
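For instance, with a public network of ``192.168.1.0/24`` reached via ``eth0``
(illustrative values only), the rule would look like:

.. prompt:: bash $

   sudo iptables -A INPUT -i eth0 -p tcp -s 192.168.1.0/24 --dport 6789 -j ACCEPT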
@ -98,9 +102,11 @@ you replace ``{iface}`` with the public network interface (e.g., ``eth0``,
``eth1``, etc.), ``{ip-address}`` with the IP address of the public network
and ``{netmask}`` with the netmask of the public network.
For example::
For example:
sudo iptables -A INPUT -i {iface} -m multiport -p tcp -s {ip-address}/{netmask} --dports 6800:7300 -j ACCEPT
.. prompt:: bash $
sudo iptables -A INPUT -i {iface} -m multiport -p tcp -s {ip-address}/{netmask} --dports 6800:7300 -j ACCEPT
OSD IP Tables
@ -139,9 +145,11 @@ the public network and other Ceph OSD Daemons will connect using the cluster
network. When you add the rule using the example below, make sure you replace
``{iface}`` with the network interface (e.g., ``eth0``, ``eth1``, etc.),
``{ip-address}`` with the IP address and ``{netmask}`` with the netmask of the
public or cluster network. For example::
public or cluster network. For example:
sudo iptables -A INPUT -i {iface} -m multiport -p tcp -s {ip-address}/{netmask} --dports 6800:7300 -j ACCEPT
.. prompt:: bash $
sudo iptables -A INPUT -i {iface} -m multiport -p tcp -s {ip-address}/{netmask} --dports 6800:7300 -j ACCEPT
.. tip:: If you run Ceph Metadata Servers on the same Ceph Node as the
Ceph OSD Daemons, you can consolidate the public network configuration step.

View File

@ -4,37 +4,50 @@
There are two Ceph daemons that store data on devices:
* **Ceph OSDs** (or Object Storage Daemons) are where most of the
data is stored in Ceph. Generally speaking, each OSD is backed by
a single storage device, like a traditional hard disk (HDD) or
solid state disk (SSD). OSDs can also be backed by a combination
of devices, like a HDD for most data and an SSD (or partition of an
SSD) for some metadata. The number of OSDs in a cluster is
generally a function of how much data will be stored, how big each
storage device will be, and the level and type of redundancy
(replication or erasure coding).
* **Ceph Monitor** daemons manage critical cluster state like cluster
membership and authentication information. For smaller clusters a
few gigabytes is all that is needed, although for larger clusters
the monitor database can reach tens or possibly hundreds of
gigabytes.
.. _rados_configuration_storage-devices_ceph_osd:
* **Ceph OSDs** (Object Storage Daemons) store most of the data
in Ceph. Usually each OSD is backed by a single storage device.
This can be a traditional hard disk (HDD) or a solid state disk
(SSD). OSDs can also be backed by a combination of devices: for
example, a HDD for most data and an SSD (or partition of an
SSD) for some metadata. The number of OSDs in a cluster is
usually a function of the amount of data to be stored, the size
of each storage device, and the level and type of redundancy
specified (replication or erasure coding).
* **Ceph Monitor** daemons manage critical cluster state. This
includes cluster membership and authentication information.
Small clusters require only a few gigabytes of storage to hold
the monitor database. In large clusters, however, the monitor
database can reach sizes of tens of gigabytes to hundreds of
gigabytes.
* **Ceph Manager** daemons run alongside monitor daemons, providing
additional monitoring and providing interfaces to external
monitoring and management systems.
OSD Backends
============
OSD Back Ends
=============
There are two ways that OSDs can manage the data they store. Starting
with the Luminous 12.2.z release, the new default (and recommended) backend is
*BlueStore*. Prior to Luminous, the default (and only option) was
*Filestore*.
There are two ways that OSDs manage the data they store. As of the Luminous
12.2.z release, the default (and recommended) back end is *BlueStore*. Prior
to the Luminous release, the default (and only) back end was *Filestore*.
.. _rados_config_storage_devices_bluestore:
BlueStore
---------
BlueStore is a special-purpose storage back end designed specifically for
managing data on disk for Ceph OSD workloads. BlueStore's design is based on
a decade of experience of supporting and managing Filestore OSDs. Key
BlueStore features include:
* Direct management of storage devices. BlueStore consumes raw block
devices or partitions. This avoids any intervening layers of

View File

@ -95,7 +95,9 @@ without the ``mon.`` prefix (i.e., ``{mon-id}`` should be the ``a``
on ``mon.a``).
#. Create the default directory on the machine that will host your
new monitor. ::
new monitor:
.. prompt:: bash $
ssh {new-mon-host}
sudo mkdir /var/lib/ceph/mon/ceph-{mon-id}
@ -103,36 +105,46 @@ on ``mon.a``).
#. Create a temporary directory ``{tmp}`` to keep the files needed during
this process. This directory should be different from the monitor's default
directory created in the previous step, and can be removed after all the
steps are executed. ::
steps are executed:
.. prompt:: bash $
mkdir {tmp}
#. Retrieve the keyring for your monitors, where ``{tmp}`` is the path to
the retrieved keyring, and ``{key-filename}`` is the name of the file
containing the retrieved monitor key. ::
containing the retrieved monitor key:
ceph auth get mon. -o {tmp}/{key-filename}
.. prompt:: bash $
ceph auth get mon. -o {tmp}/{key-filename}
#. Retrieve the monitor map, where ``{tmp}`` is the path to
the retrieved monitor map, and ``{map-filename}`` is the name of the file
containing the retrieved monitor map. ::
containing the retrieved monitor map:
ceph mon getmap -o {tmp}/{map-filename}
.. prompt:: bash $
ceph mon getmap -o {tmp}/{map-filename}
#. Prepare the monitor's data directory created in the first step. You must
specify the path to the monitor map so that you can retrieve the
information about a quorum of monitors and their ``fsid``. You must also
specify a path to the monitor keyring::
specify a path to the monitor keyring:
.. prompt:: bash $
sudo ceph-mon -i {mon-id} --mkfs --monmap {tmp}/{map-filename} --keyring {tmp}/{key-filename}
sudo ceph-mon -i {mon-id} --mkfs --monmap {tmp}/{map-filename} --keyring {tmp}/{key-filename}
#. Start the new monitor and it will automatically join the cluster.
The daemon needs to know which address to bind to, via either the
``--public-addr {ip}`` or ``--public-network {network}`` argument.
For example::
For example:
.. prompt:: bash $
ceph-mon -i {mon-id} --public-addr {ip:port}
ceph-mon -i {mon-id} --public-addr {ip:port}
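Once the new monitor is running, you can confirm that it has joined the
quorum, for example with:

.. prompt:: bash $

   ceph mon stat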
.. _removing-monitors:
@ -154,13 +166,17 @@ procedure results in only two monitor daemons, you may add or remove another
monitor until you have a number of ``ceph-mon`` daemons that can achieve a
quorum.
#. Stop the monitor. ::
#. Stop the monitor:
service ceph -a stop mon.{mon-id}
.. prompt:: bash $
service ceph -a stop mon.{mon-id}
#. Remove the monitor from the cluster. ::
#. Remove the monitor from the cluster:
ceph mon remove {mon-id}
.. prompt:: bash $
ceph mon remove {mon-id}
#. Remove the monitor entry from ``ceph.conf``.
@ -174,38 +190,61 @@ cluster, for example a cluster where the monitors cannot form a
quorum.
#. Stop all ``ceph-mon`` daemons on all monitor hosts. ::
#. Stop all ``ceph-mon`` daemons on all monitor hosts:
ssh {mon-host}
systemctl stop ceph-mon.target
# and repeat for all mons
.. prompt:: bash $
#. Identify a surviving monitor and log in to that host. ::
ssh {mon-host}
systemctl stop ceph-mon.target
ssh {mon-host}
Repeat for all monitor hosts.
#. Extract a copy of the monmap file. ::
#. Identify a surviving monitor and log in to that host:
ceph-mon -i {mon-id} --extract-monmap {map-path}
# in most cases, that's
ceph-mon -i `hostname` --extract-monmap /tmp/monmap
.. prompt:: bash $
ssh {mon-host}
#. Extract a copy of the monmap file:
.. prompt:: bash $
ceph-mon -i {mon-id} --extract-monmap {map-path}
In most cases, this command will be:
.. prompt:: bash $
ceph-mon -i `hostname` --extract-monmap /tmp/monmap
#. Remove the non-surviving or problematic monitors. For example, if
you have three monitors, ``mon.a``, ``mon.b``, and ``mon.c``, where
only ``mon.a`` will survive, follow the example below::
only ``mon.a`` will survive, follow the example below:
monmaptool {map-path} --rm {mon-id}
# for example,
monmaptool /tmp/monmap --rm b
monmaptool /tmp/monmap --rm c
.. prompt:: bash $
monmaptool {map-path} --rm {mon-id}
For example,
.. prompt:: bash $
monmaptool /tmp/monmap --rm b
monmaptool /tmp/monmap --rm c
#. Inject the surviving map with the removed monitors into the
surviving monitor(s). For example, to inject a map into monitor
``mon.a``, follow the example below::
``mon.a``, follow the example below:
ceph-mon -i {mon-id} --inject-monmap {map-path}
# for example,
ceph-mon -i a --inject-monmap /tmp/monmap
.. prompt:: bash $
ceph-mon -i {mon-id} --inject-monmap {map-path}
For example:
.. prompt:: bash $
ceph-mon -i a --inject-monmap /tmp/monmap
#. Start only the surviving monitors.
@ -316,14 +355,20 @@ networks are unable to communicate. Use the following procedure:
#. Retrieve the monitor map, where ``{tmp}`` is the path to
the retrieved monitor map, and ``{filename}`` is the name of the file
containing the retrieved monitor map. ::
containing the retrieved monitor map:
ceph mon getmap -o {tmp}/{filename}
.. prompt:: bash $
#. The following example demonstrates the contents of the monmap. ::
ceph mon getmap -o {tmp}/{filename}
#. The following example demonstrates the contents of the monmap:
.. prompt:: bash $
monmaptool --print {tmp}/{filename}
::
$ monmaptool --print {tmp}/{filename}
monmaptool: monmap file {tmp}/{filename}
epoch 1
fsid 224e376d-c5fe-4504-96bb-ea6332a19e61
@ -333,27 +378,41 @@ networks are unable to communicate. Use the following procedure:
1: 10.0.0.2:6789/0 mon.b
2: 10.0.0.3:6789/0 mon.c
#. Remove the existing monitors. ::
#. Remove the existing monitors:
$ monmaptool --rm a --rm b --rm c {tmp}/{filename}
.. prompt:: bash $
monmaptool --rm a --rm b --rm c {tmp}/{filename}
::
monmaptool: monmap file {tmp}/{filename}
monmaptool: removing a
monmaptool: removing b
monmaptool: removing c
monmaptool: writing epoch 1 to {tmp}/{filename} (0 monitors)
#. Add the new monitor locations. ::
#. Add the new monitor locations:
$ monmaptool --add a 10.1.0.1:6789 --add b 10.1.0.2:6789 --add c 10.1.0.3:6789 {tmp}/{filename}
.. prompt:: bash $
monmaptool --add a 10.1.0.1:6789 --add b 10.1.0.2:6789 --add c 10.1.0.3:6789 {tmp}/{filename}
::
monmaptool: monmap file {tmp}/{filename}
monmaptool: writing epoch 1 to {tmp}/{filename} (3 monitors)
monmaptool: monmap file {tmp}/{filename}
monmaptool: writing epoch 1 to {tmp}/{filename} (3 monitors)
#. Check new contents. ::
#. Check new contents:
$ monmaptool --print {tmp}/{filename}
.. prompt:: bash $
monmaptool --print {tmp}/{filename}
::
monmaptool: monmap file {tmp}/{filename}
epoch 1
fsid 224e376d-c5fe-4504-96bb-ea6332a19e61
@ -370,9 +429,11 @@ monitors, and inject the modified monmap into each new monitor.
#. First, make sure to stop all your monitors. Injection must be done while
the daemon is not running.
#. Inject the monmap. ::
#. Inject the monmap:
ceph-mon -i {mon-id} --inject-monmap {tmp}/{filename}
.. prompt:: bash $
ceph-mon -i {mon-id} --inject-monmap {tmp}/{filename}
#. Restart the monitors.

View File

@ -71,9 +71,11 @@ weight).
#. Create the OSD. If no UUID is given, it will be set automatically when the
OSD starts up. The following command will output the OSD number, which you
will need for subsequent steps. ::
will need for subsequent steps:
ceph osd create [{uuid} [{id}]]
.. prompt:: bash $
ceph osd create [{uuid} [{id}]]
If the optional parameter {id} is given it will be used as the OSD id.
Note, in this case the command may fail if the number is already in use.
@ -84,33 +86,38 @@ weight).
clusters are large. If {id} is not specified, the smallest available is
used.
#. Create the default directory on your new OSD. ::
#. Create the default directory on your new OSD:
ssh {new-osd-host}
sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
.. prompt:: bash $
ssh {new-osd-host}
sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
#. If the OSD is for a drive other than the OS drive, prepare it
for use with Ceph, and mount it to the directory you just created::
for use with Ceph, and mount it to the directory you just created:
ssh {new-osd-host}
sudo mkfs -t {fstype} /dev/{drive}
sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number}
.. prompt:: bash $
ssh {new-osd-host}
sudo mkfs -t {fstype} /dev/{drive}
sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number}
#. Initialize the OSD data directory. ::
#. Initialize the OSD data directory:
ssh {new-osd-host}
ceph-osd -i {osd-num} --mkfs --mkkey
.. prompt:: bash $
ssh {new-osd-host}
ceph-osd -i {osd-num} --mkfs --mkkey
The directory must be empty before you can run ``ceph-osd``.
#. Register the OSD authentication key. The value of ``ceph`` for
``ceph-{osd-num}`` in the path is the ``$cluster-$id``. If your
cluster name differs from ``ceph``, use your cluster name instead.::
cluster name differs from ``ceph``, use your cluster name instead:
ceph auth add osd.{osd-num} osd 'allow *' mon 'allow rwx' -i /var/lib/ceph/osd/ceph-{osd-num}/keyring
.. prompt:: bash $
ceph auth add osd.{osd-num} osd 'allow *' mon 'allow rwx' -i /var/lib/ceph/osd/ceph-{osd-num}/keyring
#. Add the OSD to the CRUSH map so that the OSD can begin receiving data. The
``ceph osd crush add`` command allows you to add OSDs to the CRUSH hierarchy
@ -120,9 +127,11 @@ weight).
you specify only the root bucket, the command will attach the OSD directly
to the root, but CRUSH rules expect OSDs to be inside of hosts.
Execute the following::
Execute the following:
ceph osd crush add {id-or-name} {weight} [{bucket-type}={bucket-name} ...]
.. prompt:: bash $
ceph osd crush add {id-or-name} {weight} [{bucket-type}={bucket-name} ...]
You may also decompile the CRUSH map, add the OSD to the device list, add the
host as a bucket (if it's not already in the CRUSH map), add the device as an
@ -135,36 +144,51 @@ weight).
Replacing an OSD
----------------
.. note:: If the instructions in this section do not work for you, try the
instructions in the cephadm documentation: :ref:`cephadm-replacing-an-osd`.
When disks fail, or if an administrator wants to reprovision OSDs with a new
back end (for instance, to switch from FileStore to BlueStore), OSDs need to
be replaced. Unlike `Removing the OSD`_, a replaced OSD's id and CRUSH map entry
must be kept intact after the OSD is destroyed for replacement.
#. Make sure it is safe to destroy the OSD::
#. Make sure it is safe to destroy the OSD:
while ! ceph osd safe-to-destroy osd.{id} ; do sleep 10 ; done
.. prompt:: bash $
#. Destroy the OSD first::
while ! ceph osd safe-to-destroy osd.{id} ; do sleep 10 ; done
ceph osd destroy {id} --yes-i-really-mean-it
#. Destroy the OSD first:
.. prompt:: bash $
ceph osd destroy {id} --yes-i-really-mean-it
#. Zap a disk for the new OSD, if the disk was used before for other purposes.
It's not necessary for a new disk::
It's not necessary for a new disk:
ceph-volume lvm zap /dev/sdX
.. prompt:: bash $
#. Prepare the disk for replacement by using the previously destroyed OSD id::
ceph-volume lvm zap /dev/sdX
ceph-volume lvm prepare --osd-id {id} --data /dev/sdX
#. Prepare the disk for replacement by using the previously destroyed OSD id:
#. And activate the OSD::
.. prompt:: bash $
ceph-volume lvm activate {id} {fsid}
ceph-volume lvm prepare --osd-id {id} --data /dev/sdX
#. And activate the OSD:
.. prompt:: bash $
ceph-volume lvm activate {id} {fsid}
Alternatively, instead of preparing and activating, the device can be recreated
in one call, like::
in one call, like:
ceph-volume lvm create --osd-id {id} --data /dev/sdX
.. prompt:: bash $
ceph-volume lvm create --osd-id {id} --data /dev/sdX
Starting the OSD
@ -174,15 +198,11 @@ After you add an OSD to Ceph, the OSD is in your configuration. However,
it is not yet running. The OSD is ``down`` and ``in``. You must start
your new OSD before it can begin receiving data. You may use
``service ceph`` from your admin host or start the OSD from its host
machine.
machine:
For Ubuntu Trusty use Upstart. ::
.. prompt:: bash $
sudo start ceph-osd id={osd-num}
For all other distros use systemd. ::
sudo systemctl start ceph-osd@{osd-num}
sudo systemctl start ceph-osd@{osd-num}
Once you start your OSD, it is ``up`` and ``in``.
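You can confirm the OSD's state, for example, with:

.. prompt:: bash $

   ceph osd tree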
@ -193,15 +213,16 @@ Observe the Data Migration
Once you have added your new OSD to the CRUSH map, Ceph will begin rebalancing
the server by migrating placement groups to your new OSD. You can observe this
process with the `ceph`_ tool. ::
process with the `ceph`_ tool:
ceph -w
.. prompt:: bash $
ceph -w
You should see the placement group states change from ``active+clean`` to
``active, some degraded objects``, and finally ``active+clean`` when migration
completes. (Control-c to exit.)
.. _Add/Move an OSD: ../crush-map#addosd
.. _ceph: ../monitoring
@ -228,9 +249,11 @@ Take the OSD out of the Cluster
Before you remove an OSD, it is usually ``up`` and ``in``. You need to take it
out of the cluster so that Ceph can begin rebalancing and copying its data to
other OSDs. ::
other OSDs:
ceph osd out {osd-num}
.. prompt:: bash $
ceph osd out {osd-num}
Observe the Data Migration
@ -238,9 +261,11 @@ Observe the Data Migration
Once you have taken your OSD ``out`` of the cluster, Ceph will begin
rebalancing the cluster by migrating placement groups out of the OSD you
removed. You can observe this process with the `ceph`_ tool. ::
removed. You can observe this process with the `ceph`_ tool:
ceph -w
.. prompt:: bash $
ceph -w
You should see the placement group states change from ``active+clean`` to
``active, some degraded objects``, and finally ``active+clean`` when migration
@ -252,12 +277,16 @@ completes. (Control-c to exit.)
``active+remapped`` state. If you are in this case, you should mark
the OSD ``in`` with:
``ceph osd in {osd-num}``
.. prompt:: bash $
ceph osd in {osd-num}
to come back to the initial state and then, instead of marking ``out``
the OSD, set its weight to 0 with:
``ceph osd crush reweight osd.{osd-num} 0``
.. prompt:: bash $
ceph osd crush reweight osd.{osd-num} 0
After that, you can observe the data migration which should come to its
end. The difference between marking ``out`` the OSD and reweighting it
@ -273,10 +302,12 @@ Stopping the OSD
After you take an OSD out of the cluster, it may still be running.
That is, the OSD may be ``up`` and ``out``. You must stop
your OSD before you remove it from the configuration. ::
your OSD before you remove it from the configuration:
ssh {osd-host}
sudo systemctl stop ceph-osd@{osd-num}
.. prompt:: bash $
ssh {osd-host}
sudo systemctl stop ceph-osd@{osd-num}
Once you stop your OSD, it is ``down``.
@ -292,50 +323,64 @@ OSD for each drive by repeating this procedure.
#. Let the cluster forget the OSD first. This step removes the OSD from the CRUSH
map, removes its authentication key, and removes it from the OSD map as
well. Please note that the :ref:`purge subcommand <ceph-admin-osd>` was introduced in Luminous; for older
versions, please see below ::
versions, please see below:
ceph osd purge {id} --yes-i-really-mean-it
.. prompt:: bash $
ceph osd purge {id} --yes-i-really-mean-it
#. Navigate to the host where you keep the master copy of the cluster's
``ceph.conf`` file. ::
``ceph.conf`` file:
ssh {admin-host}
cd /etc/ceph
vim ceph.conf
.. prompt:: bash $
#. Remove the OSD entry from your ``ceph.conf`` file (if it exists). ::
ssh {admin-host}
cd /etc/ceph
vim ceph.conf
#. Remove the OSD entry from your ``ceph.conf`` file (if it exists)::
[osd.1]
host = {hostname}
#. From the host where you keep the master copy of the cluster's ``ceph.conf`` file,
copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of other
hosts in your cluster.
#. From the host where you keep the master copy of the cluster's ``ceph.conf``
file, copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of
other hosts in your cluster.
If your Ceph cluster is older than Luminous, instead of using ``ceph osd purge``,
you need to perform this step manually:
If your Ceph cluster is older than Luminous, instead of using ``ceph osd
purge``, you need to perform this step manually:
#. Remove the OSD from the CRUSH map so that it no longer receives data. You may
also decompile the CRUSH map, remove the OSD from the device list, remove the
device as an item in the host bucket or remove the host bucket (if it's in the
CRUSH map and you intend to remove the host), recompile the map and set it.
See `Remove an OSD`_ for details. ::
See `Remove an OSD`_ for details:
ceph osd crush remove {name}
.. prompt:: bash $
#. Remove the OSD authentication key. ::
ceph osd crush remove {name}
ceph auth del osd.{osd-num}
#. Remove the OSD authentication key:
The value of ``ceph`` for ``ceph-{osd-num}`` in the path is the ``$cluster-$id``.
If your cluster name differs from ``ceph``, use your cluster name instead.
.. prompt:: bash $
#. Remove the OSD. ::
ceph auth del osd.{osd-num}
ceph osd rm {osd-num}
#for example
ceph osd rm 1
The value of ``ceph`` for ``ceph-{osd-num}`` in the path is the
``$cluster-$id``. If your cluster name differs from ``ceph``, use your
cluster name instead.
#. Remove the OSD:
.. prompt:: bash $
ceph osd rm {osd-num}
for example:
.. prompt:: bash $
ceph osd rm 1
.. _Remove an OSD: ../crush-map#removeosd

View File

@ -1,4 +1,3 @@
.. _balancer:
Balancer
@ -11,9 +10,11 @@ supervised fashion.
Status
------
The current status of the balancer can be checked at any time with::
The current status of the balancer can be checked at any time with:
ceph balancer status
.. prompt:: bash $
ceph balancer status
Automatic balancing
@ -21,9 +22,11 @@ Automatic balancing
The automatic balancing feature is enabled by default in ``upmap``
mode. Please refer to :ref:`upmap` for more details. The balancer can be
turned off with::
turned off with:
ceph balancer off
.. prompt:: bash $
ceph balancer off
The balancer mode can be changed to ``crush-compat`` mode, which is
backward compatible with older clients, and will make small changes to
@ -40,37 +43,51 @@ healed itself).
When the cluster is healthy, the balancer will throttle its changes
such that the percentage of PGs that are misplaced (i.e., that need to
be moved) is below a threshold of (by default) 5%. The
``target_max_misplaced_ratio`` threshold can be adjusted with::
``target_max_misplaced_ratio`` threshold can be adjusted with:
ceph config set mgr target_max_misplaced_ratio .07 # 7%
.. prompt:: bash $
Set the number of seconds to sleep in between runs of the automatic balancer::
ceph config set mgr target_max_misplaced_ratio .07 # 7%
ceph config set mgr mgr/balancer/sleep_interval 60
Set the number of seconds to sleep in between runs of the automatic balancer:
Set the time of day to begin automatic balancing in HHMM format::
.. prompt:: bash $
ceph config set mgr mgr/balancer/begin_time 0000
ceph config set mgr mgr/balancer/sleep_interval 60
Set the time of day to finish automatic balancing in HHMM format::
Set the time of day to begin automatic balancing in HHMM format:
ceph config set mgr mgr/balancer/end_time 2400
.. prompt:: bash $
ceph config set mgr mgr/balancer/begin_time 0000
Set the time of day to finish automatic balancing in HHMM format:
.. prompt:: bash $
ceph config set mgr mgr/balancer/end_time 2359
Restrict automatic balancing to this day of the week or later.
Uses the same conventions as crontab, 0 or 7 is Sunday, 1 is Monday, and so on::
Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on:
ceph config set mgr mgr/balancer/begin_weekday 0
.. prompt:: bash $
ceph config set mgr mgr/balancer/begin_weekday 0
Restrict automatic balancing to this day of the week or earlier.
Uses the same conventions as crontab, 0 or 7 is Sunday, 1 is Monday, and so on::
Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on:
ceph config set mgr mgr/balancer/end_weekday 7
.. prompt:: bash $
ceph config set mgr mgr/balancer/end_weekday 6
Pool IDs to which the automatic balancing will be limited.
The default for this is an empty string, meaning all pools will be balanced.
The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` command::
The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` command:
ceph config set mgr mgr/balancer/pool_ids 1,2,3
.. prompt:: bash $
ceph config set mgr mgr/balancer/pool_ids 1,2,3
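The pools and their numeric IDs can be listed beforehand, for example:

.. prompt:: bash $

   ceph osd pool ls detail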
Modes
@ -112,9 +129,11 @@ There are currently two supported balancer modes:
Note that using upmap requires that all clients be Luminous or newer.
The default mode is ``upmap``. The mode can be adjusted with::
The default mode is ``upmap``. The mode can be adjusted with:
ceph balancer mode crush-compat
.. prompt:: bash $
ceph balancer mode crush-compat
Supervised optimization
-----------------------
@ -125,43 +144,63 @@ The balancer operation is broken into a few distinct phases:
#. evaluating the quality of the data distribution, either for the current PG distribution, or the PG distribution that would result after executing a *plan*
#. executing the *plan*
To evaluate and score the current distribution::
To evaluate and score the current distribution:
ceph balancer eval
.. prompt:: bash $
You can also evaluate the distribution for a single pool with::
ceph balancer eval
ceph balancer eval <pool-name>
You can also evaluate the distribution for a single pool with:
Greater detail for the evaluation can be seen with::
.. prompt:: bash $
ceph balancer eval-verbose ...
ceph balancer eval <pool-name>
Greater detail for the evaluation can be seen with:
.. prompt:: bash $
ceph balancer eval-verbose ...
The balancer can generate a plan, using the currently configured mode, with::
The balancer can generate a plan, using the currently configured mode, with:
ceph balancer optimize <plan-name>
.. prompt:: bash $
The name is provided by the user and can be any useful identifying string. The contents of a plan can be seen with::
ceph balancer optimize <plan-name>
ceph balancer show <plan-name>
The name is provided by the user and can be any useful identifying string. The contents of a plan can be seen with:
All plans can be shown with::
.. prompt:: bash $
ceph balancer ls
ceph balancer show <plan-name>
Old plans can be discarded with::
All plans can be shown with:
ceph balancer rm <plan-name>
.. prompt:: bash $
Currently recorded plans are shown as part of the status command::
ceph balancer ls
ceph balancer status
Old plans can be discarded with:
The quality of the distribution that would result after executing a plan can be calculated with::
.. prompt:: bash $
ceph balancer eval <plan-name>
ceph balancer rm <plan-name>
Assuming the plan is expected to improve the distribution (i.e., it has a lower score than the current cluster state), the user can execute that plan with::
Currently recorded plans are shown as part of the status command:
ceph balancer execute <plan-name>
.. prompt:: bash $
ceph balancer status
The quality of the distribution that would result after executing a plan can be calculated with:
.. prompt:: bash $
ceph balancer eval <plan-name>
Assuming the plan is expected to improve the distribution (i.e., it has a lower score than the current cluster state), the user can execute that plan with:
.. prompt:: bash $
ceph balancer execute <plan-name>

View File

@ -41,50 +41,70 @@ more data migration than should be necessary, so it is not optimal.
ID=<osd-id-number>
DEVICE=<disk-device>
You can tell whether a given OSD is FileStore or BlueStore with::
You can tell whether a given OSD is FileStore or BlueStore with:
ceph osd metadata $ID | grep osd_objectstore
.. prompt:: bash $
You can get a current count of filestore vs bluestore with::
ceph osd metadata $ID | grep osd_objectstore
ceph osd count-metadata osd_objectstore
You can get a current count of filestore vs bluestore with:
#. Mark the filestore OSD out::
.. prompt:: bash $
ceph osd out $ID
ceph osd count-metadata osd_objectstore
#. Wait for the data to migrate off the OSD in question::
#. Mark the filestore OSD out:
while ! ceph osd safe-to-destroy $ID ; do sleep 60 ; done
.. prompt:: bash $
#. Stop the OSD::
ceph osd out $ID
systemctl kill ceph-osd@$ID
#. Wait for the data to migrate off the OSD in question:
#. Make note of which device this OSD is using::
.. prompt:: bash $
mount | grep /var/lib/ceph/osd/ceph-$ID
while ! ceph osd safe-to-destroy $ID ; do sleep 60 ; done
#. Unmount the OSD::
#. Stop the OSD:
umount /var/lib/ceph/osd/ceph-$ID
.. prompt:: bash $
systemctl kill ceph-osd@$ID
#. Make note of which device this OSD is using:
.. prompt:: bash $
mount | grep /var/lib/ceph/osd/ceph-$ID
#. Unmount the OSD:
.. prompt:: bash $
umount /var/lib/ceph/osd/ceph-$ID
#. Destroy the OSD data. Be *EXTREMELY CAREFUL* as this will destroy
the contents of the device; be certain the data on the device is
not needed (i.e., that the cluster is healthy) before proceeding. ::
not needed (i.e., that the cluster is healthy) before proceeding:
ceph-volume lvm zap $DEVICE
.. prompt:: bash $
ceph-volume lvm zap $DEVICE
#. Tell the cluster the OSD has been destroyed (and a new OSD can be
reprovisioned with the same ID)::
reprovisioned with the same ID):
ceph osd destroy $ID --yes-i-really-mean-it
.. prompt:: bash $
ceph osd destroy $ID --yes-i-really-mean-it
#. Reprovision a BlueStore OSD in its place with the same OSD ID.
This requires that you identify which device to wipe based on what you saw
mounted above. BE CAREFUL! ::
mounted above. BE CAREFUL! :
ceph-volume lvm create --bluestore --data $DEVICE --osd-id $ID
.. prompt:: bash $
ceph-volume lvm create --bluestore --data $DEVICE --osd-id $ID
#. Repeat.
@ -127,9 +147,11 @@ doesn't strictly matter). ::
NEWHOST=<empty-host-name>
Add the host to the CRUSH hierarchy, but do not attach it to the root::
Add the host to the CRUSH hierarchy, but do not attach it to the root:
ceph osd crush add-bucket $NEWHOST host
.. prompt:: bash $
ceph osd crush add-bucket $NEWHOST host
Make sure the ceph packages are installed.
@ -142,14 +164,22 @@ space on that host so that all of its data can be migrated off,
then you can instead do::
OLDHOST=<existing-cluster-host-to-offload>
ceph osd crush unlink $OLDHOST default
.. prompt:: bash $
ceph osd crush unlink $OLDHOST default
where "default" is the immediate ancestor in the CRUSH map. (For
smaller clusters with unmodified configurations this will normally
be "default", but it might also be a rack name.) You should now
see the host at the top of the OSD tree output with no parent::
see the host at the top of the OSD tree output with no parent:
.. prompt:: bash $
bin/ceph osd tree
::
$ bin/ceph osd tree
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
-5 0 host oldhost
10 ssd 1.00000 osd.10 up 1.00000 1.00000
@ -172,13 +202,17 @@ Migration process
If you're using a new host, start at step #1. For an existing host,
jump to step #5 below.
#. Provision new BlueStore OSDs for all devices::
#. Provision new BlueStore OSDs for all devices:
ceph-volume lvm create --bluestore --data /dev/$DEVICE
.. prompt:: bash $
#. Verify OSDs join the cluster with::
ceph-volume lvm create --bluestore --data /dev/$DEVICE
ceph osd tree
#. Verify OSDs join the cluster with:
.. prompt:: bash $
ceph osd tree
You should see the new host ``$NEWHOST`` with all of the OSDs beneath
it, but the host should *not* be nested beneath any other node in
@ -198,13 +232,17 @@ jump to step #5 below.
2 ssd 1.00000 osd.2 up 1.00000 1.00000
...
#. Identify the first target host to convert ::
#. Identify the first target host to convert:
OLDHOST=<existing-cluster-host-to-convert>
.. prompt:: bash $
#. Swap the new host into the old host's position in the cluster::
OLDHOST=<existing-cluster-host-to-convert>
ceph osd crush swap-bucket $NEWHOST $OLDHOST
#. Swap the new host into the old host's position in the cluster:
.. prompt:: bash $
ceph osd crush swap-bucket $NEWHOST $OLDHOST
At this point all data on ``$OLDHOST`` will start migrating to OSDs
on ``$NEWHOST``. If there is a difference in the total capacity of
@ -212,26 +250,34 @@ jump to step #5 below.
other nodes in the cluster, but as long as the hosts are similarly
sized this will be a relatively small amount of data.
#. Wait for data migration to complete::
#. Wait for data migration to complete:
while ! ceph osd safe-to-destroy $(ceph osd ls-tree $OLDHOST); do sleep 60 ; done
.. prompt:: bash $
#. Stop all old OSDs on the now-empty ``$OLDHOST``::
while ! ceph osd safe-to-destroy $(ceph osd ls-tree $OLDHOST); do sleep 60 ; done
ssh $OLDHOST
#. Stop all old OSDs on the now-empty ``$OLDHOST``:
.. prompt:: bash $
ssh $OLDHOST
systemctl kill ceph-osd.target
umount /var/lib/ceph/osd/ceph-*
#. Destroy and purge the old OSDs::
#. Destroy and purge the old OSDs:
for osd in `ceph osd ls-tree $OLDHOST`; do
.. prompt:: bash $
for osd in `ceph osd ls-tree $OLDHOST`; do
ceph osd purge $osd --yes-i-really-mean-it
done
#. Wipe the old OSD devices. This requires that you identify which
devices are to be wiped manually (BE CAREFUL!). For each device,::
devices are to be wiped manually (BE CAREFUL!). For each device:
ceph-volume lvm zap $DEVICE
.. prompt:: bash $
ceph-volume lvm zap $DEVICE
#. Use the now-empty host as the new host, and repeat::

View File

@ -45,16 +45,22 @@ and the backing storage tier automatically. However, admins have the ability to
configure how this migration takes place by setting the ``cache-mode``. There are
two main scenarios:
- **writeback** mode: When admins configure tiers with ``writeback`` mode, Ceph
clients write data to the cache tier and receive an ACK from the cache tier.
In time, the data written to the cache tier migrates to the storage tier
and gets flushed from the cache tier. Conceptually, the cache tier is
overlaid "in front" of the backing storage tier. When a Ceph client needs
data that resides in the storage tier, the cache tiering agent migrates the
data to the cache tier on read, then it is sent to the Ceph client.
Thereafter, the Ceph client can perform I/O using the cache tier, until the
data becomes inactive. This is ideal for mutable data (e.g., photo/video
editing, transactional data, etc.).
- **writeback** mode: If the base tier and the cache tier are configured in
``writeback`` mode, Ceph clients receive an ACK from the base tier every time
they write data to it. Then the cache tiering agent determines whether
``osd_tier_default_cache_min_write_recency_for_promote`` has been set. If it
has been set and the data has been written more than a specified number of
times per interval, the data is promoted to the cache tier.
When Ceph clients need access to data stored in the base tier, the cache
tiering agent reads the data from the base tier and returns it to the client.
While data is being read from the base tier, the cache tiering agent consults
the value of ``osd_tier_default_cache_min_read_recency_for_promote`` and
decides whether to promote that data from the base tier to the cache tier.
When data has been promoted from the base tier to the cache tier, the Ceph
client is able to perform I/O operations on it using the cache tier. This is
well-suited for mutable data (for example, photo/video editing, transactional
data).
- **readproxy** mode: This mode will use any objects that already
exist in the cache tier, but if an object is not present in the
@ -199,62 +205,82 @@ Creating a Cache Tier
=====================
Setting up a cache tier involves associating a backing storage pool with
a cache pool ::
a cache pool:
ceph osd tier add {storagepool} {cachepool}
.. prompt:: bash $
For example ::
ceph osd tier add {storagepool} {cachepool}
ceph osd tier add cold-storage hot-storage
For example:
To set the cache mode, execute the following::
.. prompt:: bash $
ceph osd tier cache-mode {cachepool} {cache-mode}
ceph osd tier add cold-storage hot-storage
For example::
To set the cache mode, execute the following:
ceph osd tier cache-mode hot-storage writeback
.. prompt:: bash $
ceph osd tier cache-mode {cachepool} {cache-mode}
For example:
.. prompt:: bash $
ceph osd tier cache-mode hot-storage writeback
The cache tiers overlay the backing storage tier, so they require one
additional step: you must direct all client traffic from the storage pool to
the cache pool. To direct client traffic directly to the cache pool, execute
the following::
the following:
ceph osd tier set-overlay {storagepool} {cachepool}
.. prompt:: bash $
For example::
ceph osd tier set-overlay {storagepool} {cachepool}
ceph osd tier set-overlay cold-storage hot-storage
For example:
.. prompt:: bash $
ceph osd tier set-overlay cold-storage hot-storage
Configuring a Cache Tier
========================
Cache tiers have several configuration options. You may set
cache tier configuration options with the following usage::
cache tier configuration options with the following usage:
ceph osd pool set {cachepool} {key} {value}
.. prompt:: bash $
ceph osd pool set {cachepool} {key} {value}
See `Pools - Set Pool Values`_ for details.
Target Size and Type
--------------------
Ceph's production cache tiers use a `Bloom Filter`_ for the ``hit_set_type``::
Ceph's production cache tiers use a `Bloom Filter`_ for the ``hit_set_type``:
ceph osd pool set {cachepool} hit_set_type bloom
.. prompt:: bash $
For example::
ceph osd pool set {cachepool} hit_set_type bloom
ceph osd pool set hot-storage hit_set_type bloom
For example:
.. prompt:: bash $
ceph osd pool set hot-storage hit_set_type bloom
The ``hit_set_count`` and ``hit_set_period`` define how many such HitSets to
store, and how much time each HitSet should cover. ::
store, and how much time each HitSet should cover:
ceph osd pool set {cachepool} hit_set_count 12
ceph osd pool set {cachepool} hit_set_period 14400
ceph osd pool set {cachepool} target_max_bytes 1000000000000
.. prompt:: bash $
ceph osd pool set {cachepool} hit_set_count 12
ceph osd pool set {cachepool} hit_set_period 14400
ceph osd pool set {cachepool} target_max_bytes 1000000000000
.. note:: A larger ``hit_set_count`` results in more RAM consumed by
the ``ceph-osd`` process.
@ -273,10 +299,12 @@ number of archive HitSets are checked. The object is promoted if the object is
found in any of the most recent ``min_read_recency_for_promote`` HitSets.
A similar parameter can be set for the write operation, which is
``min_write_recency_for_promote``. ::
``min_write_recency_for_promote``:
ceph osd pool set {cachepool} min_read_recency_for_promote 2
ceph osd pool set {cachepool} min_write_recency_for_promote 2
.. prompt:: bash $
ceph osd pool set {cachepool} min_read_recency_for_promote 2
ceph osd pool set {cachepool} min_write_recency_for_promote 2
.. note:: The longer the period and the higher the
``min_read_recency_for_promote`` and
@ -303,22 +331,29 @@ Absolute Sizing
The cache tiering agent can flush or evict objects based upon the total number
of bytes or the total number of objects. To specify a maximum number of bytes,
execute the following::
execute the following:
ceph osd pool set {cachepool} target_max_bytes {#bytes}
.. prompt:: bash $
For example, to flush or evict at 1 TB, execute the following::
ceph osd pool set {cachepool} target_max_bytes {#bytes}
ceph osd pool set hot-storage target_max_bytes 1099511627776
For example, to flush or evict at 1 TB, execute the following:
.. prompt:: bash $
To specify the maximum number of objects, execute the following::
ceph osd pool set hot-storage target_max_bytes 1099511627776
ceph osd pool set {cachepool} target_max_objects {#objects}
To specify the maximum number of objects, execute the following:
For example, to flush or evict at 1M objects, execute the following::
.. prompt:: bash $
ceph osd pool set hot-storage target_max_objects 1000000
ceph osd pool set {cachepool} target_max_objects {#objects}
For example, to flush or evict at 1M objects, execute the following:
.. prompt:: bash $
ceph osd pool set hot-storage target_max_objects 1000000
.. note:: Ceph is not able to determine the size of a cache pool automatically, so
the configuration on the absolute size is required here, otherwise the
@ -335,59 +370,79 @@ The cache tiering agent can flush or evict objects relative to the size of the
cache pool(specified by ``target_max_bytes`` / ``target_max_objects`` in
`Absolute sizing`_). When the cache pool consists of a certain percentage of
modified (or dirty) objects, the cache tiering agent will flush them to the
storage pool. To set the ``cache_target_dirty_ratio``, execute the following::
storage pool. To set the ``cache_target_dirty_ratio``, execute the following:
ceph osd pool set {cachepool} cache_target_dirty_ratio {0.0..1.0}
.. prompt:: bash $
ceph osd pool set {cachepool} cache_target_dirty_ratio {0.0..1.0}
For example, setting the value to ``0.4`` will begin flushing modified
(dirty) objects when they reach 40% of the cache pool's capacity::
(dirty) objects when they reach 40% of the cache pool's capacity:
ceph osd pool set hot-storage cache_target_dirty_ratio 0.4
.. prompt:: bash $
ceph osd pool set hot-storage cache_target_dirty_ratio 0.4
When dirty objects reach a certain percentage of the cache pool's capacity, the cache tiering agent flushes dirty
objects with a higher speed. To set the ``cache_target_dirty_high_ratio``::
objects with a higher speed. To set the ``cache_target_dirty_high_ratio``:
ceph osd pool set {cachepool} cache_target_dirty_high_ratio {0.0..1.0}
.. prompt:: bash $
For example, setting the value to ``0.6`` will begin aggressively flush dirty objects
when they reach 60% of the cache pool's capacity. obviously, we'd better set the value
between dirty_ratio and full_ratio::
ceph osd pool set {cachepool} cache_target_dirty_high_ratio {0.0..1.0}
ceph osd pool set hot-storage cache_target_dirty_high_ratio 0.6
For example, setting the value to ``0.6`` will begin aggressively flushing dirty
objects when they reach 60% of the cache pool's capacity. The value should be
set between ``cache_target_dirty_ratio`` and ``cache_target_full_ratio``:
.. prompt:: bash $
ceph osd pool set hot-storage cache_target_dirty_high_ratio 0.6
When the cache pool reaches a certain percentage of its capacity, the cache
tiering agent will evict objects to maintain free capacity. To set the
``cache_target_full_ratio``, execute the following::
``cache_target_full_ratio``, execute the following:
ceph osd pool set {cachepool} cache_target_full_ratio {0.0..1.0}
.. prompt:: bash $
ceph osd pool set {cachepool} cache_target_full_ratio {0.0..1.0}
For example, setting the value to ``0.8`` will begin evicting unmodified
(clean) objects when they reach 80% of the cache pool's capacity::
(clean) objects when they reach 80% of the cache pool's capacity:
ceph osd pool set hot-storage cache_target_full_ratio 0.8
.. prompt:: bash $
ceph osd pool set hot-storage cache_target_full_ratio 0.8
Cache Age
---------
You can specify the minimum age of an object before the cache tiering agent
flushes a recently modified (or dirty) object to the backing storage pool::
flushes a recently modified (or dirty) object to the backing storage pool:
ceph osd pool set {cachepool} cache_min_flush_age {#seconds}
.. prompt:: bash $
For example, to flush modified (or dirty) objects after 10 minutes, execute
the following::
ceph osd pool set {cachepool} cache_min_flush_age {#seconds}
ceph osd pool set hot-storage cache_min_flush_age 600
For example, to flush modified (or dirty) objects after 10 minutes, execute the
following:
You can specify the minimum age of an object before it will be evicted from
the cache tier::
.. prompt:: bash $
ceph osd pool {cache-tier} cache_min_evict_age {#seconds}
ceph osd pool set hot-storage cache_min_flush_age 600
For example, to evict objects after 30 minutes, execute the following::
You can specify the minimum age of an object before it will be evicted from the
cache tier:
ceph osd pool set hot-storage cache_min_evict_age 1800
.. prompt:: bash $
ceph osd pool set {cache-tier} cache_min_evict_age {#seconds}
For example, to evict objects after 30 minutes, execute the following:
.. prompt:: bash $
ceph osd pool set hot-storage cache_min_evict_age 1800
Removing a Cache Tier
@ -403,22 +458,29 @@ Removing a Read-Only Cache
Since a read-only cache does not have modified data, you can disable
and remove it without losing any recent changes to objects in the cache.
#. Change the cache-mode to ``none`` to disable it. ::
#. Change the cache-mode to ``none`` to disable it.:
ceph osd tier cache-mode {cachepool} none
.. prompt:: bash $
For example::
ceph osd tier cache-mode {cachepool} none
ceph osd tier cache-mode hot-storage none
For example:
#. Remove the cache pool from the backing pool. ::
.. prompt:: bash $
ceph osd tier remove {storagepool} {cachepool}
ceph osd tier cache-mode hot-storage none
For example::
#. Remove the cache pool from the backing pool.:
ceph osd tier remove cold-storage hot-storage
.. prompt:: bash $
ceph osd tier remove {storagepool} {cachepool}
For example:
.. prompt:: bash $
ceph osd tier remove cold-storage hot-storage
Removing a Writeback Cache
@ -430,41 +492,57 @@ disable and remove it.
#. Change the cache mode to ``proxy`` so that new and modified objects will
flush to the backing storage pool. ::
flush to the backing storage pool.:
ceph osd tier cache-mode {cachepool} proxy
.. prompt:: bash $
For example::
ceph osd tier cache-mode {cachepool} proxy
ceph osd tier cache-mode hot-storage proxy
For example:
.. prompt:: bash $
ceph osd tier cache-mode hot-storage proxy
#. Ensure that the cache pool has been flushed. This may take a few minutes::
#. Ensure that the cache pool has been flushed. This may take a few minutes:
rados -p {cachepool} ls
.. prompt:: bash $
rados -p {cachepool} ls
If the cache pool still has objects, you can flush them manually.
For example::
For example:
rados -p {cachepool} cache-flush-evict-all
.. prompt:: bash $
rados -p {cachepool} cache-flush-evict-all
#. Remove the overlay so that clients will not direct traffic to the cache. ::
#. Remove the overlay so that clients will not direct traffic to the cache.:
ceph osd tier remove-overlay {storagetier}
.. prompt:: bash $
For example::
ceph osd tier remove-overlay {storagetier}
ceph osd tier remove-overlay cold-storage
For example:
.. prompt:: bash $
ceph osd tier remove-overlay cold-storage
#. Finally, remove the cache tier pool from the backing storage pool. ::
#. Finally, remove the cache tier pool from the backing storage pool.:
ceph osd tier remove {storagepool} {cachepool}
.. prompt:: bash $
For example::
ceph osd tier remove {storagepool} {cachepool}
ceph osd tier remove cold-storage hot-storage
For example:
.. prompt:: bash $
ceph osd tier remove cold-storage hot-storage
.. _Create a Pool: ../pools#create-a-pool

View File

@ -30,18 +30,24 @@ This mode lets you mark monitors as disallowed, in which case they will
participate in the quorum and serve clients, but cannot be elected leader. You
may wish to use this if you have some monitors which are known to be far away
from clients.
You can disallow a leader by running ::
You can disallow a leader by running:
$ ceph mon add disallowed_leader {name}
.. prompt:: bash $
ceph mon add disallowed_leader {name}
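For example, to disallow a hypothetical monitor named ``c`` from being elected leader (the name is purely illustrative):
.. prompt:: bash $
ceph mon add disallowed_leader c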
You can remove a monitor from the disallowed list, and allow it to become
a leader again, by running ::
a leader again, by running:
$ ceph mon rm disallowed_leader {name}
.. prompt:: bash $
The list of disallowed_leaders is included when you run ::
ceph mon rm disallowed_leader {name}
$ ceph mon dump
The list of disallowed_leaders is included when you run:
.. prompt:: bash $
ceph mon dump
The connectivity Mode
=====================
@ -58,9 +64,11 @@ Examining connectivity scores
=============================
The monitors maintain connection scores even if they aren't in
the connectivity election mode. You can examine the scores a monitor
has by running ::
has by running:
ceph daemon mon.{name} connection scores dump
.. prompt:: bash $
ceph daemon mon.{name} connection scores dump
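For example, to dump the scores held by a hypothetical monitor named ``a``:
.. prompt:: bash $
ceph daemon mon.a connection scores dump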
Scores for individual connections range from 0-1 inclusive, and also
include whether the connection is considered alive or dead (determined by
@ -68,9 +76,11 @@ whether it returned its latest ping within the timeout).
While this would be an unexpected occurrence, if for some reason you experience
problems and troubleshooting makes you think your scores have become invalid,
you can forget history and reset them by running ::
you can forget history and reset them by running:
ceph daemon mon.{name} connection scores reset
.. prompt:: bash $
ceph daemon mon.{name} connection scores reset
While resetting scores has low risk (monitors will still quickly determine
if a connection is alive or dead, and trend back to the previous scores if they

View File

@ -8,38 +8,50 @@
Monitor Commands
================
Monitor commands are issued using the ``ceph`` utility::
Monitor commands are issued using the ``ceph`` utility:
ceph [-m monhost] {command}
.. prompt:: bash $
The command is usually (though not always) of the form::
ceph [-m monhost] {command}
ceph {subsystem} {command}
The command is usually (though not always) of the form:
.. prompt:: bash $
ceph {subsystem} {command}
System Commands
===============
Execute the following to display the current cluster status. ::
Execute the following to display the current cluster status. :
ceph -s
ceph status
.. prompt:: bash $
ceph -s
ceph status
Execute the following to display a running summary of cluster status
and major events. ::
and major events. :
ceph -w
.. prompt:: bash $
ceph -w
Execute the following to show the monitor quorum, including which monitors are
participating and which one is the leader. ::
participating and which one is the leader. :
ceph mon stat
ceph quorum_status
.. prompt:: bash $
ceph mon stat
ceph quorum_status
Execute the following to query the status of a single monitor, including whether
or not it is in the quorum. ::
or not it is in the quorum. :
ceph tell mon.[id] mon_status
.. prompt:: bash $
ceph tell mon.[id] mon_status
where the value of ``[id]`` can be determined, e.g., from ``ceph -s``.
@ -47,21 +59,27 @@ where the value of ``[id]`` can be determined, e.g., from ``ceph -s``.
Authentication Subsystem
========================
To add a keyring for an OSD, execute the following::
To add a keyring for an OSD, execute the following:
ceph auth add {osd} {--in-file|-i} {path-to-osd-keyring}
.. prompt:: bash $
To list the cluster's keys and their capabilities, execute the following::
ceph auth add {osd} {--in-file|-i} {path-to-osd-keyring}
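For example, to add the keyring of a hypothetical ``osd.0`` (the keyring path shown is only an illustration):
.. prompt:: bash $
ceph auth add osd.0 -i /var/lib/ceph/osd/ceph-0/keyring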
ceph auth ls
To list the cluster's keys and their capabilities, execute the following:
.. prompt:: bash $
ceph auth ls
Placement Group Subsystem
=========================
To display the statistics for all placement groups (PGs), execute the following::
To display the statistics for all placement groups (PGs), execute the following:
ceph pg dump [--format {format}]
.. prompt:: bash $
ceph pg dump [--format {format}]
The valid formats are ``plain`` (default), ``json``, ``json-pretty``, ``xml``, and ``xml-pretty``.
When implementing monitoring and other tools, it is best to use ``json`` format.
@ -70,9 +88,11 @@ less variable from release to release. The ``jq`` utility can be invaluable whe
data from JSON output.
To display the statistics for all placement groups stuck in a specified state,
execute the following::
execute the following:
ceph pg dump_stuck inactive|unclean|stale|undersized|degraded [--format {format}] [-t|--threshold {seconds}]
.. prompt:: bash $
ceph pg dump_stuck inactive|unclean|stale|undersized|degraded [--format {format}] [-t|--threshold {seconds}]
``--format`` may be ``plain`` (default), ``json``, ``json-pretty``, ``xml``, or ``xml-pretty``.
@ -90,9 +110,11 @@ reported to the monitor cluster in a while (configured by
``mon_osd_report_timeout``).
Delete "lost" objects or revert them to their prior state, either a previous version
or delete them if they were just created. ::
or delete them if they were just created. :
ceph pg {pgid} mark_unfound_lost revert|delete
.. prompt:: bash $
ceph pg {pgid} mark_unfound_lost revert|delete
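For example, to revert the unfound objects of a hypothetical placement group ``2.5``:
.. prompt:: bash $
ceph pg 2.5 mark_unfound_lost revert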
.. _osd-subsystem:
@ -100,105 +122,149 @@ or delete them if they were just created. ::
OSD Subsystem
=============
Query OSD subsystem status. ::
Query OSD subsystem status. :
ceph osd stat
.. prompt:: bash $
ceph osd stat
Write a copy of the most recent OSD map to a file. See
:ref:`osdmaptool <osdmaptool>`. ::
:ref:`osdmaptool <osdmaptool>`. :
ceph osd getmap -o file
.. prompt:: bash $
ceph osd getmap -o file
Write a copy of the crush map from the most recent OSD map to
file. ::
file. :
ceph osd getcrushmap -o file
.. prompt:: bash $
The foregoing is functionally equivalent to ::
ceph osd getcrushmap -o file
ceph osd getmap -o /tmp/osdmap
osdmaptool /tmp/osdmap --export-crush file
The foregoing is functionally equivalent to :
.. prompt:: bash $
ceph osd getmap -o /tmp/osdmap
osdmaptool /tmp/osdmap --export-crush file
Dump the OSD map. Valid formats for ``-f`` are ``plain``, ``json``, ``json-pretty``,
``xml``, and ``xml-pretty``. If no ``--format`` option is given, the OSD map is
dumped as plain text. As above, JSON format is best for tools, scripting, and other automation. ::
dumped as plain text. As above, JSON format is best for tools, scripting, and other automation. :
ceph osd dump [--format {format}]
.. prompt:: bash $
ceph osd dump [--format {format}]
Dump the OSD map as a tree with one line per OSD containing weight
and state. ::
and state. :
ceph osd tree [--format {format}]
.. prompt:: bash $
Find out where a specific object is or would be stored in the system::
ceph osd tree [--format {format}]
ceph osd map <pool-name> <object-name>
Find out where a specific object is or would be stored in the system:
.. prompt:: bash $
ceph osd map <pool-name> <object-name>
Add or move a new item (OSD) with the given id/name/weight at the specified
location. ::
location. :
ceph osd crush set {id} {weight} [{loc1} [{loc2} ...]]
.. prompt:: bash $
Remove an existing item (OSD) from the CRUSH map. ::
ceph osd crush set {id} {weight} [{loc1} [{loc2} ...]]
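For example, to place a hypothetical ``osd.3`` with weight ``1.0`` under an illustrative host bucket:
.. prompt:: bash $
ceph osd crush set osd.3 1.0 root=default host=node1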
ceph osd crush remove {name}
Remove an existing item (OSD) from the CRUSH map. :
Remove an existing bucket from the CRUSH map. ::
.. prompt:: bash $
ceph osd crush remove {bucket-name}
ceph osd crush remove {name}
Move an existing bucket from one position in the hierarchy to another. ::
Remove an existing bucket from the CRUSH map. :
ceph osd crush move {id} {loc1} [{loc2} ...]
.. prompt:: bash $
Set the weight of the item given by ``{name}`` to ``{weight}``. ::
ceph osd crush remove {bucket-name}
ceph osd crush reweight {name} {weight}
Move an existing bucket from one position in the hierarchy to another. :
Mark an OSD as ``lost``. This may result in permanent data loss. Use with caution. ::
.. prompt:: bash $
ceph osd lost {id} [--yes-i-really-mean-it]
ceph osd crush move {id} {loc1} [{loc2} ...]
Set the weight of the item given by ``{name}`` to ``{weight}``. :
.. prompt:: bash $
ceph osd crush reweight {name} {weight}
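For example, to set the CRUSH weight of a hypothetical ``osd.7`` to ``4.0`` (weights are typically the device capacity in TiB):
.. prompt:: bash $
ceph osd crush reweight osd.7 4.0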
Mark an OSD as ``lost``. This may result in permanent data loss. Use with caution. :
.. prompt:: bash $
ceph osd lost {id} [--yes-i-really-mean-it]
Create a new OSD. If no UUID is given, it will be set automatically when the OSD
starts up. ::
starts up. :
ceph osd create [{uuid}]
.. prompt:: bash $
Remove the given OSD(s). ::
ceph osd create [{uuid}]
ceph osd rm [{id}...]
Remove the given OSD(s). :
Query the current ``max_osd`` parameter in the OSD map. ::
.. prompt:: bash $
ceph osd getmaxosd
ceph osd rm [{id}...]
Import the given crush map. ::
Query the current ``max_osd`` parameter in the OSD map. :
ceph osd setcrushmap -i file
.. prompt:: bash $
ceph osd getmaxosd
Import the given crush map. :
.. prompt:: bash $
ceph osd setcrushmap -i file
Set the ``max_osd`` parameter in the OSD map. This defaults to 10000 now so
most admins will never need to adjust this. ::
most admins will never need to adjust this. :
ceph osd setmaxosd
.. prompt:: bash $
Mark OSD ``{osd-num}`` down. ::
ceph osd setmaxosd
ceph osd down {osd-num}
Mark OSD ``{osd-num}`` down. :
Mark OSD ``{osd-num}`` out of the distribution (i.e. allocated no data). ::
.. prompt:: bash $
ceph osd out {osd-num}
ceph osd down {osd-num}
Mark ``{osd-num}`` in the distribution (i.e. allocated data). ::
Mark OSD ``{osd-num}`` out of the distribution (i.e. allocated no data). :
ceph osd in {osd-num}
.. prompt:: bash $
ceph osd out {osd-num}
Mark ``{osd-num}`` in the distribution (i.e. allocated data). :
.. prompt:: bash $
ceph osd in {osd-num}
Set or clear the pause flags in the OSD map. If set, no IO requests
will be sent to any OSD. Clearing the flags via unpause results in
resending pending requests. ::
resending pending requests. :
ceph osd pause
ceph osd unpause
.. prompt:: bash $
ceph osd pause
ceph osd unpause
Set the override weight (reweight) of ``{osd-num}`` to ``{weight}``. Two OSDs with the
same weight will receive roughly the same number of I/O requests and
@ -209,9 +275,11 @@ otherwise live on this drive. It does not change weights assigned
to the buckets above the OSD in the crush map, and is a corrective
measure in case the normal CRUSH distribution is not working out quite
right. For instance, if one of your OSDs is at 90% and the others are
at 50%, you could reduce this weight to compensate. ::
at 50%, you could reduce this weight to compensate. :
ceph osd reweight {osd-num} {weight}
.. prompt:: bash $
ceph osd reweight {osd-num} {weight}
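For example, to reduce the override weight of a hypothetical OSD ``7`` that is markedly fuller than its peers:
.. prompt:: bash $
ceph osd reweight 7 0.7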
Balance OSD fullness by reducing the override weight of OSDs which are
overly utilized. Note that these override aka ``reweight`` values
@ -219,9 +287,11 @@ default to 1.00000 and are relative only to each other; they are not absolute.
It is crucial to distinguish them from CRUSH weights, which reflect the
absolute capacity of a bucket in TiB. By default this command adjusts
override weight on OSDs which have + or - 20% of the average utilization,
but if you include a ``threshold`` that percentage will be used instead. ::
but if you include a ``threshold`` that percentage will be used instead. :
ceph osd reweight-by-utilization [threshold [max_change [max_osds]]] [--no-increasing]
.. prompt:: bash $
ceph osd reweight-by-utilization [threshold [max_change [max_osds]]] [--no-increasing]
To limit the step by which any OSD's reweight will be changed, specify
``max_change`` which defaults to 0.05. To limit the number of OSDs that will
@ -230,9 +300,11 @@ parameters can speed leveling of OSD utilization, at the potential cost of
greater impact on client operations due to more data moving at once.
To determine which and how many PGs and OSDs will be affected by a given invocation
you can test before executing. ::
you can test before executing. :
ceph osd test-reweight-by-utilization [threshold [max_change max_osds]] [--no-increasing]
.. prompt:: bash $
ceph osd test-reweight-by-utilization [threshold [max_change max_osds]] [--no-increasing]
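For example, a dry run followed by the real adjustment, using an illustrative threshold of 110%:
.. prompt:: bash $
ceph osd test-reweight-by-utilization 110
ceph osd reweight-by-utilization 110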
Adding ``--no-increasing`` to either command prevents increasing any
override weights that are currently < 1.00000. This can be useful when
@ -243,33 +315,46 @@ Deployments utilizing Nautilus (or later revisions of Luminous and Mimic)
that have no pre-Luminous clients may instead wish to enable the
``balancer`` module for ``ceph-mgr``.
Add/remove an IP address to/from the blocklist. When adding an address,
Add/remove an IP address or CIDR range to/from the blocklist.
When adding to the blocklist,
you can specify how long it should be blocklisted in seconds; otherwise,
it will default to 1 hour. A blocklisted address is prevented from
connecting to any OSD. Blocklisting is most often used to prevent a
lagging metadata server from making bad changes to data on the OSDs.
connecting to any OSD. If you blocklist an IP or range containing an OSD, be aware
that OSD will also be prevented from performing operations on its peers where it
acts as a client. (This includes tiering and copy-from functionality.)
If you want to blocklist a range (in CIDR format), you may do so by
including the ``range`` keyword.
These commands are mostly only useful for failure testing, as
blocklists are normally maintained automatically and shouldn't need
manual intervention. ::
manual intervention. :
ceph osd blocklist add ADDRESS[:source_port] [TIME]
ceph osd blocklist rm ADDRESS[:source_port]
.. prompt:: bash $
Creates/deletes a snapshot of a pool. ::
ceph osd blocklist ["range"] add ADDRESS[:source_port][/netmask_bits] [TIME]
ceph osd blocklist ["range"] rm ADDRESS[:source_port][/netmask_bits]
ceph osd pool mksnap {pool-name} {snap-name}
ceph osd pool rmsnap {pool-name} {snap-name}
Creates/deletes a snapshot of a pool. :
Creates/deletes/renames a storage pool. ::
.. prompt:: bash $
ceph osd pool create {pool-name} [pg_num [pgp_num]]
ceph osd pool delete {pool-name} [{pool-name} --yes-i-really-really-mean-it]
ceph osd pool rename {old-name} {new-name}
ceph osd pool mksnap {pool-name} {snap-name}
ceph osd pool rmsnap {pool-name} {snap-name}
Changes a pool setting. ::
Creates/deletes/renames a storage pool. :
ceph osd pool set {pool-name} {field} {value}
.. prompt:: bash $
ceph osd pool create {pool-name} [pg_num [pgp_num]]
ceph osd pool delete {pool-name} [{pool-name} --yes-i-really-really-mean-it]
ceph osd pool rename {old-name} {new-name}
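For example, to create and later rename a hypothetical pool:
.. prompt:: bash $
ceph osd pool create mypool 64
ceph osd pool rename mypool mynewpool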
Changes a pool setting. :
.. prompt:: bash $
ceph osd pool set {pool-name} {field} {value}
Valid fields are:
@ -278,9 +363,11 @@ Valid fields are:
* ``pgp_num``: Effective number when calculating pg placement.
* ``crush_rule``: rule number for mapping placement.
Get the value of a pool setting. ::
Get the value of a pool setting. :
ceph osd pool get {pool-name} {field}
.. prompt:: bash $
ceph osd pool get {pool-name} {field}
Valid fields are:
@ -288,49 +375,67 @@ Valid fields are:
* ``pgp_num``: Effective number of placement groups when calculating placement.
Sends a scrub command to OSD ``{osd-num}``. To send the command to all OSDs, use ``*``. ::
Sends a scrub command to OSD ``{osd-num}``. To send the command to all OSDs, use ``*``. :
ceph osd scrub {osd-num}
.. prompt:: bash $
Sends a repair command to OSD.N. To send the command to all OSDs, use ``*``. ::
ceph osd scrub {osd-num}
ceph osd repair N
Sends a repair command to OSD.N. To send the command to all OSDs, use ``*``. :
.. prompt:: bash $
ceph osd repair N
Runs a simple throughput benchmark against OSD.N, writing ``TOTAL_DATA_BYTES``
in write requests of ``BYTES_PER_WRITE`` each. By default, the test
writes 1 GB in total in 4-MB increments.
The benchmark is non-destructive and will not overwrite existing live
OSD data, but might temporarily affect the performance of clients
concurrently accessing the OSD. ::
concurrently accessing the OSD. :
ceph tell osd.N bench [TOTAL_DATA_BYTES] [BYTES_PER_WRITE]
.. prompt:: bash $
To clear an OSD's caches between benchmark runs, use the 'cache drop' command ::
ceph tell osd.N bench [TOTAL_DATA_BYTES] [BYTES_PER_WRITE]
ceph tell osd.N cache drop
To clear an OSD's caches between benchmark runs, use the 'cache drop' command :
To get the cache statistics of an OSD, use the 'cache status' command ::
.. prompt:: bash $
ceph tell osd.N cache status
ceph tell osd.N cache drop
To get the cache statistics of an OSD, use the 'cache status' command :
.. prompt:: bash $
ceph tell osd.N cache status
MDS Subsystem
=============
Change configuration parameters on a running mds. ::
Change configuration parameters on a running mds. :
ceph tell mds.{mds-id} config set {setting} {value}
.. prompt:: bash $
Example::
ceph tell mds.{mds-id} config set {setting} {value}
ceph tell mds.0 config set debug_ms 1
Example:
Enables debug messages. ::
.. prompt:: bash $
ceph mds stat
ceph tell mds.0 config set debug_ms 1
Displays the status of all metadata servers. ::
Enables debug messages. :
ceph mds fail 0
.. prompt:: bash $
ceph mds stat
Displays the status of all metadata servers. :
.. prompt:: bash $
ceph mds fail 0
Marks the active MDS as failed, triggering failover to a standby if present.
@ -340,18 +445,24 @@ Marks the active MDS as failed, triggering failover to a standby if present.
Mon Subsystem
=============
Show monitor stats::
Show monitor stats:
ceph mon stat
.. prompt:: bash $
ceph mon stat
::
e2: 3 mons at {a=127.0.0.1:40000/0,b=127.0.0.1:40001/0,c=127.0.0.1:40002/0}, election epoch 6, quorum 0,1,2 a,b,c
The ``quorum`` list at the end lists monitor nodes that are part of the current quorum.
This is also available more directly::
This is also available more directly:
ceph quorum_status -f json-pretty
.. prompt:: bash $
ceph quorum_status -f json-pretty
.. code-block:: javascript
@ -405,9 +516,11 @@ This is also available more directly::
The above will block until a quorum is reached.
For a status of just a single monitor::
For a status of just a single monitor:
ceph tell mon.[name] mon_status
.. prompt:: bash $
ceph tell mon.[name] mon_status
where the value of ``[name]`` can be taken from ``ceph quorum_status``. Sample
output::
@ -469,10 +582,14 @@ output::
}
}
A dump of the monitor state::
A dump of the monitor state:
.. prompt:: bash $
ceph mon dump
::
dumped monmap epoch 2
epoch 2
fsid ba807e74-b64f-4b72-b43f-597dfe60ddbc

View File

@ -35,7 +35,9 @@ Pool Values`_.
Get a CRUSH Map
---------------
To get the CRUSH map for your cluster, execute the following::
To get the CRUSH map for your cluster, execute the following:
.. prompt:: bash $
ceph osd getcrushmap -o {compiled-crushmap-filename}
@ -48,7 +50,9 @@ edit it.
Decompile a CRUSH Map
---------------------
To decompile a CRUSH map, execute the following::
To decompile a CRUSH map, execute the following:
.. prompt:: bash $
crushtool -d {compiled-crushmap-filename} -o {decompiled-crushmap-filename}
@ -57,7 +61,9 @@ To decompile a CRUSH map, execute the following::
Recompile a CRUSH Map
---------------------
To compile a CRUSH map, execute the following::
To compile a CRUSH map, execute the following:
.. prompt:: bash $
crushtool -c {decompiled-crushmap-filename} -o {compiled-crushmap-filename}
@ -66,7 +72,9 @@ To compile a CRUSH map, execute the following::
Set the CRUSH Map
-----------------
To set the CRUSH map for your cluster, execute the following::
To set the CRUSH map for your cluster, execute the following:
.. prompt:: bash $
ceph osd setcrushmap -i {compiled-crushmap-filename}
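Taken together, a typical get/decompile/edit/recompile/set cycle looks like the following sketch (edit ``crushmap.txt`` between the decompile and compile steps; the filenames are arbitrary illustrations):
.. prompt:: bash $
ceph osd getcrushmap -o crushmap.bin
crushtool -d crushmap.bin -o crushmap.txt
crushtool -c crushmap.txt -o crushmap.new
ceph osd setcrushmap -i crushmap.new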
@ -118,14 +126,22 @@ Devices may also have a *device class* associated with them (e.g.,
``hdd`` or ``ssd``), allowing them to be conveniently targeted by a
crush rule.
.. prompt:: bash #
devices
::
# devices
device {num} {osd.name} [class {class}]
For example::
For example:
.. prompt:: bash #
devices
::
# devices
device 0 osd.0 class ssd
device 1 osd.1 class hdd
device 2 osd.2
@ -136,10 +152,6 @@ is normally a single storage device, a pair of devices (for example,
one for data and one for a journal or metadata), or in some cases a
small RAID device.
CRUSH Map Bucket Types
----------------------
@ -157,9 +169,9 @@ media.
To add a bucket type to the CRUSH map, create a new line under your list of
bucket types. Enter ``type`` followed by a unique numeric ID and a bucket name.
By convention, there is one leaf bucket and it is ``type 0``; however, you may
give it any name you like (e.g., osd, disk, drive, storage, etc.)::
give it any name you like (e.g., osd, disk, drive, storage)::
#types
# types
type {num} {bucket-name}
For example::
@ -199,8 +211,8 @@ distribution units, pods, rows, rooms, and data centers. With the exception of
the leaf nodes representing OSDs, the rest of the hierarchy is arbitrary, and
you may define it according to your own needs.
We recommend adapting your CRUSH map to your firms's hardware naming conventions
and using instances names that reflect the physical hardware. Your naming
We recommend adapting your CRUSH map to your firm's hardware naming conventions
and using instance names that reflect the physical hardware. Your naming
practice can make it easier to administer the cluster and troubleshoot
problems when an OSD and/or other hardware malfunctions and the administrator
needs access to physical hardware.
@ -655,29 +667,38 @@ There are three types of transformations possible:
single bucket. For example, in the previous example, we want the
``ssd`` bucket to be mapped to the ``default`` bucket.
The final command to convert the map comprised of the above fragments would be something like::
The final command to convert the map comprising the above fragments would be something like:
$ ceph osd getcrushmap -o original
$ crushtool -i original --reclassify \
--set-subtree-class default hdd \
--reclassify-root default hdd \
--reclassify-bucket %-ssd ssd default \
--reclassify-bucket ssd ssd default \
-o adjusted
.. prompt:: bash $
In order to ensure that the conversion is correct, there is a ``--compare`` command that will test a large sample of inputs to the CRUSH map and ensure that the same result comes back out. These inputs are controlled by the same options that apply to the ``--test`` command. For the above example,::
ceph osd getcrushmap -o original
crushtool -i original --reclassify \
--set-subtree-class default hdd \
--reclassify-root default hdd \
--reclassify-bucket %-ssd ssd default \
--reclassify-bucket ssd ssd default \
-o adjusted
To ensure that the conversion is correct, there is a ``--compare`` command that will test a large sample of inputs against the CRUSH map and check that the same result is output. These inputs are controlled by the same options that apply to the ``--test`` command. For the above example:
.. prompt:: bash $
crushtool -i original --compare adjusted
::
$ crushtool -i original --compare adjusted
rule 0 had 0/10240 mismatched mappings (0)
rule 1 had 0/10240 mismatched mappings (0)
maps appear equivalent
If there were difference, you'd see what ratio of inputs are remapped
in the parentheses.
If there were differences, the ratio of remapped inputs would be reported in
the parentheses.
If you are satisfied with the adjusted map, you can apply it to the cluster with something like::
When you are satisfied with the adjusted map, apply it to the cluster with a command of the form:
ceph osd setcrushmap -i adjusted
.. prompt:: bash $
ceph osd setcrushmap -i adjusted
Tuning CRUSH, the hard way
--------------------------
@ -686,7 +707,9 @@ If you can ensure that all clients are running recent code, you can
adjust the tunables by extracting the CRUSH map, modifying the values,
and reinjecting it into the cluster.
* Extract the latest CRUSH map::
* Extract the latest CRUSH map:
.. prompt:: bash $
ceph osd getcrushmap -o /tmp/crush
@ -694,19 +717,25 @@ and reinjecting it into the cluster.
for both large and small clusters we tested with. You will need to
additionally specify the ``--enable-unsafe-tunables`` argument to
``crushtool`` for this to work. Please use this option with
extreme care.::
extreme care.:
crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new
.. prompt:: bash $
* Reinject modified map::
crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new
ceph osd setcrushmap -i /tmp/crush.new
* Reinject modified map:
.. prompt:: bash $
ceph osd setcrushmap -i /tmp/crush.new
Legacy values
-------------
For reference, the legacy values for the CRUSH tunables can be set
with::
with:
.. prompt:: bash $
crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 --set-chooseleaf-descend-once 0 --set-chooseleaf-vary-r 0 -o /tmp/crush.legacy
@ -715,4 +744,4 @@ Further, as noted above, be careful running old versions of the
``ceph-osd`` daemon after reverting to legacy values as the feature
bit is not perfectly enforced.
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.com/wp-content/uploads/2016/08/weil-crush-sc06.pdf
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.io/assets/pdfs/weil-crush-sc06.pdf

View File

@ -184,9 +184,11 @@ will be the total of all devices contained beneath it. Normally
weights are in units of terabytes (TB).
You can get a simple view of the CRUSH hierarchy for your cluster,
including weights, with::
including weights, with:
ceph osd tree
.. prompt:: bash $
ceph osd tree
Rules
-----
@ -208,13 +210,17 @@ erasure coded), the *failure domain*, and optionally a *device class*.
In rare cases rules must be written by hand by manually editing the
CRUSH map.
You can see what rules are defined for your cluster with::
You can see what rules are defined for your cluster with:
ceph osd crush rule ls
.. prompt:: bash $
You can view the contents of the rules with::
ceph osd crush rule ls
ceph osd crush rule dump
You can view the contents of the rules with:
.. prompt:: bash $
ceph osd crush rule dump
Device classes
--------------
@ -224,34 +230,44 @@ default, OSDs automatically set their class at startup to
`hdd`, `ssd`, or `nvme` based on the type of device they are backed
by.
The device class for one or more OSDs can be explicitly set with::
The device class for one or more OSDs can be explicitly set with:
ceph osd crush set-device-class <class> <osd-name> [...]
.. prompt:: bash $
ceph osd crush set-device-class <class> <osd-name> [...]
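For example, to mark two hypothetical OSDs as ``ssd``:
.. prompt:: bash $
ceph osd crush set-device-class ssd osd.2 osd.3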
Once a device class is set, it cannot be changed to another class
until the old class is unset with::
until the old class is unset with:
ceph osd crush rm-device-class <osd-name> [...]
.. prompt:: bash $
ceph osd crush rm-device-class <osd-name> [...]
This allows administrators to set device classes without the class
being changed on OSD restart or by some other script.
A placement rule that targets a specific device class can be created with::
A placement rule that targets a specific device class can be created with:
ceph osd crush rule create-replicated <rule-name> <root> <failure-domain> <class>
.. prompt:: bash $
A pool can then be changed to use the new rule with::
ceph osd crush rule create-replicated <rule-name> <root> <failure-domain> <class>
ceph osd pool set <pool-name> crush_rule <rule-name>
A pool can then be changed to use the new rule with:
.. prompt:: bash $
ceph osd pool set <pool-name> crush_rule <rule-name>
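For example, the following creates a rule named ``fast`` that targets the ``ssd`` class, then switches a hypothetical pool to it (the rule and pool names are illustrative):
.. prompt:: bash $
ceph osd crush rule create-replicated fast default host ssd
ceph osd pool set mypool crush_rule fast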
Device classes are implemented by creating a "shadow" CRUSH hierarchy
for each device class in use that contains only devices of that class.
CRUSH rules can then distribute data over the shadow hierarchy.
This approach is fully backward compatible with
old Ceph clients. You can view the CRUSH hierarchy with shadow items
with::
with:
ceph osd crush tree --show-shadow
.. prompt:: bash $
ceph osd crush tree --show-shadow
For older clusters created before Luminous that relied on manually
crafted CRUSH maps to maintain per-device-type hierarchies, there is a
@ -295,9 +311,11 @@ There are two types of weight sets supported:
When weight sets are in use, the weights associated with each node in
the hierarchy are visible as a separate column (labeled either
``(compat)`` or the pool name) from the command::
``(compat)`` or the pool name) from the command:
ceph osd tree
.. prompt:: bash $
ceph osd tree
When both *compat* and *per-pool* weight sets are in use, data
placement for a particular pool will use its own per-pool weight set
@ -320,9 +338,11 @@ Add/Move an OSD
.. note: OSDs are normally automatically added to the CRUSH map when
the OSD is created. This command is rarely needed.
To add or move an OSD in the CRUSH map of a running cluster::
To add or move an OSD in the CRUSH map of a running cluster:
ceph osd crush set {name} {weight} root={root} [{bucket-type}={bucket-name} ...]
.. prompt:: bash $
ceph osd crush set {name} {weight} root={root} [{bucket-type}={bucket-name} ...]
Where:
@ -359,9 +379,11 @@ Where:
The following example adds ``osd.0`` to the hierarchy, or moves the
OSD from a previous location. ::
OSD from a previous location:
ceph osd crush set osd.0 1.0 root=default datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1
.. prompt:: bash $
ceph osd crush set osd.0 1.0 root=default datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1
Adjust OSD weight
@ -372,9 +394,11 @@ Adjust OSD weight
is rarely needed.
To adjust an OSD's CRUSH weight in the CRUSH map of a running cluster, execute
the following::
the following:
ceph osd crush reweight {name} {weight}
.. prompt:: bash $
ceph osd crush reweight {name} {weight}
Where:
@ -403,9 +427,11 @@ Remove an OSD
``ceph osd purge`` command. This command is rarely needed.
To remove an OSD from the CRUSH map of a running cluster, execute the
following::
following:
ceph osd crush remove {name}
.. prompt:: bash $
ceph osd crush remove {name}
Where:
@ -431,9 +457,11 @@ Add a Bucket
``default`` or other root as described below.
To add a bucket in the CRUSH map of a running cluster, execute the
``ceph osd crush add-bucket`` command::
``ceph osd crush add-bucket`` command:
ceph osd crush add-bucket {bucket-name} {bucket-type}
.. prompt:: bash $
ceph osd crush add-bucket {bucket-name} {bucket-type}
Where:
@ -453,17 +481,21 @@ Where:
:Example: ``rack``
The following example adds the ``rack12`` bucket to the hierarchy::
The following example adds the ``rack12`` bucket to the hierarchy:
ceph osd crush add-bucket rack12 rack
.. prompt:: bash $
ceph osd crush add-bucket rack12 rack
Move a Bucket
-------------
To move a bucket to a different location or position in the CRUSH map
hierarchy, execute the following::
hierarchy, execute the following:
ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...]
.. prompt:: bash $
ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...]
Where:
@ -484,9 +516,11 @@ Where:
Remove a Bucket
---------------
To remove a bucket from the CRUSH hierarchy, execute the following::
To remove a bucket from the CRUSH hierarchy, execute the following:
ceph osd crush remove {bucket-name}
.. prompt:: bash $
ceph osd crush remove {bucket-name}
.. note:: A bucket must be empty before removing it from the CRUSH hierarchy.
@ -499,9 +533,11 @@ Where:
:Required: Yes
:Example: ``rack12``
The following example removes the ``rack12`` bucket from the hierarchy::
The following example removes the ``rack12`` bucket from the hierarchy:
ceph osd crush remove rack12
.. prompt:: bash $
ceph osd crush remove rack12
Creating a compat weight set
----------------------------
@ -509,24 +545,32 @@ Creating a compat weight set
.. note: This step is normally done automatically by the ``balancer``
module when enabled.
To create a *compat* weight set::
To create a *compat* weight set:
ceph osd crush weight-set create-compat
.. prompt:: bash $
Weights for the compat weight set can be adjusted with::
ceph osd crush weight-set create-compat
ceph osd crush weight-set reweight-compat {name} {weight}
Weights for the compat weight set can be adjusted with:
The compat weight set can be destroyed with::
.. prompt:: bash $
ceph osd crush weight-set rm-compat
ceph osd crush weight-set reweight-compat {name} {weight}
The compat weight set can be destroyed with:
.. prompt:: bash $
ceph osd crush weight-set rm-compat
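For example, to nudge the compat weight of a hypothetical ``osd.0`` down slightly:
.. prompt:: bash $
ceph osd crush weight-set reweight-compat osd.0 0.9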
Creating per-pool weight sets
-----------------------------
To create a weight set for a specific pool,::
To create a weight set for a specific pool:
ceph osd crush weight-set create {pool-name} {mode}
.. prompt:: bash $
ceph osd crush weight-set create {pool-name} {mode}
.. note:: Per-pool weight sets require that all servers and daemons
run Luminous v12.2.z or later.
@ -553,17 +597,23 @@ Where:
:Required: Yes
:Example: ``flat``
To adjust the weight of an item in a weight set::
To adjust the weight of an item in a weight set:
ceph osd crush weight-set reweight {pool-name} {item-name} {weight [...]}
.. prompt:: bash $
To list existing weight sets,::
ceph osd crush weight-set reweight {pool-name} {item-name} {weight [...]}
ceph osd crush weight-set ls
To list existing weight sets:
To remove a weight set,::
.. prompt:: bash $
ceph osd crush weight-set rm {pool-name}
ceph osd crush weight-set ls
To remove a weight set:
.. prompt:: bash $
ceph osd crush weight-set rm {pool-name}
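For example, assuming a hypothetical pool named ``mypool`` with a per-pool weight set:
.. prompt:: bash $
ceph osd crush weight-set reweight mypool osd.0 0.8
ceph osd crush weight-set ls
ceph osd crush weight-set rm mypool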
Creating a rule for a replicated pool
-------------------------------------
@ -588,9 +638,11 @@ classify themselves as either ``hdd`` or ``ssd``, depending on the
underlying type of device being used. These classes can also be
customized.
To create a replicated rule,::
To create a replicated rule:
ceph osd crush rule create-replicated {name} {root} {failure-domain-type} [{class}]
.. prompt:: bash $
ceph osd crush rule create-replicated {name} {root} {failure-domain-type} [{class}]
Where:
@ -635,13 +687,17 @@ you must include this information in the *erasure code profile*. A CRUSH
rule will then be created from that either explicitly or automatically when
the profile is used to create a pool.
The erasure code profiles can be listed with::
The erasure code profiles can be listed with:
ceph osd erasure-code-profile ls
.. prompt:: bash $
An existing profile can be viewed with::
ceph osd erasure-code-profile ls
ceph osd erasure-code-profile get {profile-name}
An existing profile can be viewed with:
.. prompt:: bash $
ceph osd erasure-code-profile get {profile-name}
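For example, to inspect the ``default`` profile that ships with a new cluster:
.. prompt:: bash $
ceph osd erasure-code-profile get default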
Normally profiles should never be modified; instead, a new profile
should be created and used when creating a new pool or creating a new
@ -659,9 +715,11 @@ The erasure code profile properties of interest are:
* **crush-device-class**: the device class on which to place data [default: none, meaning all devices are used].
* **k** and **m** (and, for the ``lrc`` plugin, **l**): these determine the number of erasure code shards, affecting the resulting CRUSH rule.
Once a profile is defined, you can create a CRUSH rule with::
Once a profile is defined, you can create a CRUSH rule with:
ceph osd crush rule create-erasure {name} {profile-name}
.. prompt:: bash $
ceph osd crush rule create-erasure {name} {profile-name}
.. note: When creating a new pool, it is not actually necessary to
explicitly create the rule. If the erasure code profile alone is
@ -671,9 +729,11 @@ Once a profile is defined, you can create a CRUSH rule with::
Deleting rules
--------------
Rules that are not in use by pools can be deleted with::
Rules that are not in use by pools can be deleted with:
ceph osd crush rule rm {rule-name}
.. prompt:: bash $
ceph osd crush rule rm {rule-name}
.. _crush-map-tunables:
@ -882,14 +942,18 @@ To make this warning go away, you have two options:
result in some data movement (possibly as much as 10%). This is the
preferred route, but should be taken with care on a production cluster
where the data movement may affect performance. You can enable optimal
tunables with::
tunables with:
.. prompt:: bash $
ceph osd crush tunables optimal
If things go poorly (e.g., too much load) and not very much
progress has been made, or there is a client compatibility problem
(old kernel CephFS or RBD clients, or pre-Bobtail ``librados``
clients), you can switch back with::
clients), you can switch back with:
.. prompt:: bash $
ceph osd crush tunables legacy
@ -899,7 +963,9 @@ To make this warning go away, you have two options:
mon warn on legacy crush tunables = false
For the change to take effect, you will need to restart the monitors, or
apply the option to running monitors with::
apply the option to running monitors with:
.. prompt:: bash $
ceph tell mon.\* config set mon_warn_on_legacy_crush_tunables false
@ -936,7 +1002,7 @@ sets known as *profiles*. As of the Octopus release these are:
* ``firefly``: the values supported by the firefly release
* ``hammer``: the values supported by the hammer release
* ``jewel``: the values supported by the jewel release
* ``optimal``: the best (ie optimal) values of the current version of Ceph
* ``optimal``: the best (i.e. optimal) values of the current version of Ceph
* ``default``: the default values of a new cluster installed from
scratch. These values, which depend on the current version of Ceph,
are hardcoded and are generally a mix of optimal and legacy values.
@ -944,17 +1010,18 @@ sets known as *profiles*. As of the Octopus release these are:
LTS release, or the most recent release for which we generally expect
most users to have up-to-date clients.
You can apply a profile to a running cluster with the command::
You can apply a profile to a running cluster with the command:
ceph osd crush tunables {PROFILE}
.. prompt:: bash $
ceph osd crush tunables {PROFILE}
Note that this may result in data movement, potentially quite a bit. Study
release notes and documentation carefully before changing the profile on a
running cluster, and consider throttling recovery/backfill parameters to
limit the impact of a bolus of backfill.
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.com/wp-content/uploads/2016/08/weil-crush-sc06.pdf
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.io/assets/pdfs/weil-crush-sc06.pdf
Primary Affinity
@ -987,19 +1054,20 @@ interface bandwidth and CPU cycles more evenly.
By default, all ceph OSDs have primary affinity of ``1``, which indicates that
any OSD may act as a primary with equal probability.
You can reduce a Ceph OSD's primary affinity so that CRUSH is less likely to choose
the OSD as primary in a PG's acting set.::
You can reduce a Ceph OSD's primary affinity so that CRUSH is less likely to
choose the OSD as primary in a PG's acting set.:
ceph osd primary-affinity <osd-id> <weight>
.. prompt:: bash $
You may set an OSD's primary affinity to a real number in the range
``[0-1]``, where ``0`` indicates that the OSD may **NOT** be used as a primary
and ``1`` indicates that an OSD may be used as a primary. When the weight is
between these extremes, it is less likely that
CRUSH will select that OSD as a primary. The process for
selecting the lead OSD is more nuanced than a simple probability based on
relative affinity values, but measurable results can be achieved even with
first-order approximations of desirable values.
ceph osd primary-affinity <osd-id> <weight>
You may set an OSD's primary affinity to a real number in the range ``[0-1]``,
where ``0`` indicates that the OSD may **NOT** be used as a primary and ``1``
indicates that an OSD may be used as a primary. When the weight is between
these extremes, it is less likely that CRUSH will select that OSD as a primary.
The process for selecting the lead OSD is more nuanced than a simple
probability based on relative affinity values, but measurable results can be
achieved even with first-order approximations of desirable values.
Custom CRUSH Rules
------------------
@ -1052,7 +1120,6 @@ must not contain the same servers::
}
Note also that on failure of an SSD, requests to a PG will be served temporarily
from a (slower) HDD OSD until the PG's data has been replicated onto the replacement
primary SSD OSD.

View File

@ -1,4 +1,3 @@
.. _devices:
Device Management
@ -11,19 +10,25 @@ provide tools to predict and/or automatically respond to hardware failure.
Device tracking
---------------
You can query which storage devices are in use with::
You can query which storage devices are in use with:
ceph device ls
.. prompt:: bash $
You can also list devices by daemon or by host::
ceph device ls
ceph device ls-by-daemon <daemon>
ceph device ls-by-host <host>
You can also list devices by daemon or by host:
.. prompt:: bash $
ceph device ls-by-daemon <daemon>
ceph device ls-by-host <host>
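For example, to list the devices used by a hypothetical ``osd.0`` and by a hypothetical host ``node1``:
.. prompt:: bash $
ceph device ls-by-daemon osd.0
ceph device ls-by-host node1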
For any individual device, you can query information about its
location and how it is being consumed with::
location and how it is being consumed with:
ceph device info <devid>
.. prompt:: bash $
ceph device info <devid>
Identifying physical devices
----------------------------
@ -34,18 +39,22 @@ failed disks easy and less error-prone. Use the following command::
device light on|off <devid> [ident|fault] [--force]
The ``<devid>`` parameter is the device identification. You can obtain this
information using the following command::
information using the following command:
ceph device ls
.. prompt:: bash $
ceph device ls
The ``[ident|fault]`` parameter is used to set the kind of light to blink.
By default, the `identification` light is used.
.. note::
This command needs the Cephadm or the Rook `orchestrator <https://docs.ceph.com/docs/master/mgr/orchestrator/#orchestrator-cli-module>`_ module enabled.
You can check which orchestrator module is enabled by executing the following command::
You can check which orchestrator module is enabled by executing the following command:
ceph orch status
.. prompt:: bash $
ceph orch status
The command used behind the scenes to blink the drive LEDs is `lsmcli`. If you need
to customize this command, you can configure it via a Jinja2 template::
@ -77,40 +86,54 @@ or unrecoverable read errors. Other device types like SAS and NVMe
implement a similar set of metrics (via slightly different standards).
All of these can be collected by Ceph via the ``smartctl`` tool.
You can enable or disable health monitoring with::
You can enable or disable health monitoring with:
ceph device monitoring on
.. prompt:: bash $
or::
ceph device monitoring on
ceph device monitoring off
or:
.. prompt:: bash $
ceph device monitoring off
Scraping
--------
If monitoring is enabled, metrics will automatically be scraped at regular intervals. That interval can be configured with::
If monitoring is enabled, metrics will automatically be scraped at regular intervals. That interval can be configured with:
ceph config set mgr mgr/devicehealth/scrape_frequency <seconds>
.. prompt:: bash $
ceph config set mgr mgr/devicehealth/scrape_frequency <seconds>
The default is to scrape once every 24 hours.
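For example, to scrape every 12 hours instead (an arbitrary interval chosen only for illustration):
.. prompt:: bash $
ceph config set mgr mgr/devicehealth/scrape_frequency 43200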
You can manually trigger a scrape of all devices with::
You can manually trigger a scrape of all devices with:
.. prompt:: bash $
ceph device scrape-health-metrics
ceph device scrape-health-metrics
A single device can be scraped with::
A single device can be scraped with:
ceph device scrape-health-metrics <device-id>
.. prompt:: bash $
Or a single daemon's devices can be scraped with::
ceph device scrape-health-metrics <device-id>
ceph device scrape-daemon-health-metrics <who>
Or a single daemon's devices can be scraped with:
.. prompt:: bash $
ceph device scrape-daemon-health-metrics <who>
The stored health metrics for a device can be retrieved (optionally
for a specific timestamp) with::
for a specific timestamp) with:
ceph device get-health-metrics <devid> [sample-timestamp]
.. prompt:: bash $
ceph device get-health-metrics <devid> [sample-timestamp]
Failure prediction
------------------
@ -121,29 +144,39 @@ health metrics it collects. There are three modes:
* *none*: disable device failure prediction.
* *local*: use a pre-trained prediction model from the ceph-mgr daemon
The prediction mode can be configured with::
The prediction mode can be configured with:
ceph config set global device_failure_prediction_mode <mode>
.. prompt:: bash $
ceph config set global device_failure_prediction_mode <mode>
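For example, to use the built-in ``local`` prediction model:
.. prompt:: bash $
ceph config set global device_failure_prediction_mode local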
Prediction normally runs in the background on a periodic basis, so it
may take some time before life expectancy values are populated. You
can see the life expectancy of all devices in output from::
can see the life expectancy of all devices in output from:
ceph device ls
.. prompt:: bash $
You can also query the metadata for a specific device with::
ceph device ls
ceph device info <devid>
You can also query the metadata for a specific device with:
You can explicitly force prediction of a device's life expectancy with::
.. prompt:: bash $
ceph device predict-life-expectancy <devid>
ceph device info <devid>
You can explicitly force prediction of a device's life expectancy with:
.. prompt:: bash $
ceph device predict-life-expectancy <devid>
If you are not using Ceph's internal device failure prediction but
have some external source of information about device failures, you
can inform Ceph of a device's life expectancy with::
can inform Ceph of a device's life expectancy with:
ceph device set-life-expectancy <devid> <from> [<to>]
.. prompt:: bash $
ceph device set-life-expectancy <devid> <from> [<to>]
Life expectancies are expressed as a time interval so that
uncertainty can be expressed in the form of a wide interval. The
@ -156,9 +189,11 @@ The ``mgr/devicehealth/warn_threshold`` controls how soon an expected
device failure must be before a health warning is generated.
The stored life expectancy of all devices can be checked, and any
appropriate health alerts generated, with::
appropriate health alerts generated, with:
ceph device check-health
.. prompt:: bash $
ceph device check-health
Automatic Mitigation
--------------------

View File

@ -38,30 +38,35 @@ to achieve recovery from an OSD failure.
Erasure-code profile examples
=============================
An example configuration that can be used to observe reduced bandwidth usage::
An example configuration that can be used to observe reduced bandwidth usage:
$ ceph osd erasure-code-profile set CLAYprofile \
plugin=clay \
k=4 m=2 d=5 \
crush-failure-domain=host
$ ceph osd pool create claypool erasure CLAYprofile
.. prompt:: bash $
ceph osd erasure-code-profile set CLAYprofile \
plugin=clay \
k=4 m=2 d=5 \
crush-failure-domain=host
ceph osd pool create claypool erasure CLAYprofile
Creating a clay profile
=======================
To create a new clay code profile::
To create a new clay code profile:
ceph osd erasure-code-profile set {name} \
plugin=clay \
k={data-chunks} \
m={coding-chunks} \
[d={helper-chunks}] \
[scalar_mds={plugin-name}] \
[technique={technique-name}] \
[crush-failure-domain={bucket-type}] \
[directory={directory}] \
[--force]
.. prompt:: bash $
ceph osd erasure-code-profile set {name} \
plugin=clay \
k={data-chunks} \
m={coding-chunks} \
[d={helper-chunks}] \
[scalar_mds={plugin-name}] \
[technique={technique-name}] \
[crush-failure-domain={bucket-type}] \
[crush-device-class={device-class}] \
[directory={directory}] \
[--force]
Where:

View File

@ -9,18 +9,20 @@ library.
Create an isa profile
=====================
To create a new *isa* erasure code profile::
To create a new *isa* erasure code profile:
ceph osd erasure-code-profile set {name} \
plugin=isa \
technique={reed_sol_van|cauchy} \
[k={data-chunks}] \
[m={coding-chunks}] \
[crush-root={root}] \
[crush-failure-domain={bucket-type}] \
[crush-device-class={device-class}] \
[directory={directory}] \
[--force]
.. prompt:: bash $
ceph osd erasure-code-profile set {name} \
plugin=isa \
technique={reed_sol_van|cauchy} \
[k={data-chunks}] \
[m={coding-chunks}] \
[crush-root={root}] \
[crush-failure-domain={bucket-type}] \
[crush-device-class={device-class}] \
[directory={directory}] \
[--force]
Where:

View File

@ -13,18 +13,20 @@ understanding of the parameters.
Create a jerasure profile
=========================
To create a new *jerasure* erasure code profile::
To create a new *jerasure* erasure code profile:
.. prompt:: bash $
ceph osd erasure-code-profile set {name} \
plugin=jerasure \
k={data-chunks} \
m={coding-chunks} \
technique={reed_sol_van|reed_sol_r6_op|cauchy_orig|cauchy_good|liberation|blaum_roth|liber8tion} \
[crush-root={root}] \
[crush-failure-domain={bucket-type}] \
[crush-device-class={device-class}] \
[directory={directory}] \
[--force]
ceph osd erasure-code-profile set {name} \
plugin=jerasure \
k={data-chunks} \
m={coding-chunks} \
technique={reed_sol_van|reed_sol_r6_op|cauchy_orig|cauchy_good|liberation|blaum_roth|liber8tion} \
[crush-root={root}] \
[crush-failure-domain={bucket-type}] \
[crush-device-class={device-class}] \
[directory={directory}] \
[--force]
Where:

Some files were not shown because too many files have changed in this diff