Mirror of https://git.proxmox.com/git/ceph.git (synced 2025-04-28 15:01:36 +00:00)

commit 39ae355f72 (parent e04241aa9b)

import ceph quincy 17.2.6

Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>

ceph/.github/CODEOWNERS (vendored)
@@ -110,6 +110,7 @@ README* @ceph/doc-writers
/qa/workunits/cls/test_cls_lock.sh @ceph/rbd
/qa/workunits/cls/test_cls_rbd.sh @ceph/rbd
/qa/workunits/rbd @ceph/rbd
/qa/workunits/windows @ceph/rbd
/src/ceph-rbdnamer @ceph/rbd
/src/cls/journal @ceph/rbd
/src/cls/lock @ceph/rbd
ceph/.readthedocs.yml
@@ -1,10 +1,6 @@
---
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
#
# The pre_build command checks if we're building a named branch (i.e., not a PR).
# If so, check out doc/releases from the main branch before building so
# it's always up to date on docs.ceph.com/en/*.

version: 2
formats: []
@@ -14,9 +10,7 @@ build:
python: "3.8"
apt_packages:
- ditaa
jobs:
pre_build:
- bash admin/rtd-checkout-main
- graphviz
python:
install:
- requirements: admin/doc-requirements.txt
ceph/CMakeLists.txt
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.16)

project(ceph
VERSION 17.2.5
VERSION 17.2.6
LANGUAGES CXX C ASM)

cmake_policy(SET CMP0028 NEW)
@@ -344,9 +344,11 @@ and then jemalloc. If neither of then is found. use the one in libc.")
if(ALLOCATOR)
if(${ALLOCATOR} MATCHES "tcmalloc(_minimal)?")
find_package(gperftools 2.6.2 REQUIRED)
set(ALLOC_LIBS gperftools::${ALLOCATOR})
set(HAVE_LIBTCMALLOC ON)
elseif(${ALLOCATOR} STREQUAL "jemalloc")
find_package(JeMalloc REQUIRED)
set(ALLOC_LIBS JeMalloc::JeMalloc)
set(HAVE_JEMALLOC 1)
elseif(NOT ALLOCATOR STREQUAL "libc")
message(FATAL_ERROR "Unsupported allocator selected: ${ALLOCATOR}")
@@ -359,8 +361,10 @@ else(ALLOCATOR)
endif()
if(gperftools_FOUND)
set(ALLOCATOR tcmalloc)
set(ALLOC_LIBS gperftools::tcmalloc)
elseif(JeMalloc_FOUND)
set(ALLOCATOR jemalloc)
set(ALLOC_LIBS JeMalloc::JeMalloc)
else()
if(NOT FREEBSD)
# FreeBSD already has jemalloc as its default allocator
@@ -369,6 +373,13 @@ else(ALLOCATOR)
set(ALLOCATOR "libc")
endif(gperftools_FOUND)
endif(ALLOCATOR)
if(NOT ALLOCATOR STREQUAL "libc")
add_compile_options(
$<$<COMPILE_LANGUAGE:CXX>:-fno-builtin-malloc>
$<$<COMPILE_LANGUAGE:CXX>:-fno-builtin-calloc>
$<$<COMPILE_LANGUAGE:CXX>:-fno-builtin-realloc>
$<$<COMPILE_LANGUAGE:CXX>:-fno-builtin-free>)
endif()

# Mingw generates incorrect entry points when using "-pie".
if(WIN32 OR (HAVE_LIBTCMALLOC AND WITH_STATIC_LIBSTDCXX))
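For context, the branch above keys off the ``ALLOCATOR`` CMake cache variable; a minimal configure sketch, assuming an out-of-tree build directory next to the source::

   # Illustrative configure invocations for the allocator selection logic above.
   cmake -DALLOCATOR=tcmalloc ..   # requires gperftools >= 2.6.2
   cmake -DALLOCATOR=jemalloc ..   # requires a JeMalloc development package
   cmake -DALLOCATOR=libc ..       # skips the -fno-builtin-* compile options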
ceph/src/pybind/mgr/dashboard/HACKING.rst
@@ -156,9 +156,9 @@ For Angular code, we follow the official Angular style guide:
https://angular.io/guide/styleguide

To check whether your code is conformant with the style guide, we use a
combination of TSLint, Codelyzer and Prettier:
combination of ESLint, Codelyzer and Prettier:

https://palantir.github.io/tslint/
https://eslint.org/
http://codelyzer.com/
https://prettier.io/
ceph/PendingReleaseNotes
@@ -1,3 +1,34 @@
>=17.2.6
--------

* `ceph mgr dump` command now outputs `last_failure_osd_epoch` and
`active_clients` fields at the top level. Previously, these fields were
output under the `always_on_modules` field.

>=17.2.5
--------

* RBD: The semantics of the compare-and-write C++ API (`Image::compare_and_write`
and `Image::aio_compare_and_write` methods) now match those of the C API. Both
compare and write steps operate only on `len` bytes even if the respective
buffers are larger. The previous behavior of comparing up to the size of
the compare buffer was prone to subtle breakage upon straddling a stripe
unit boundary.
* RBD: compare-and-write operation is no longer limited to 512-byte sectors.
Assuming proper alignment, it now allows operating on stripe units (4M by
default).
* RBD: New `rbd_aio_compare_and_writev` API method to support scatter/gather
on both compare and write buffers. This complements the existing `rbd_aio_readv`
and `rbd_aio_writev` methods.
* RBD: `rbd device unmap` command gained a `--namespace` option. Support for
namespaces was added to RBD in Nautilus 14.2.0 and it has been possible to
map and unmap images in namespaces using the `image-spec` syntax since then,
but the corresponding option available in most other commands was missing.
* CEPHFS: Rename the `mds_max_retries_on_remount_failure` option to
`client_max_retries_on_remount_failure` and move it from mds.yaml.in to
mds-client.yaml.in, because this option has only ever been used by the MDS
client.

>=17.2.4
--------

@@ -8,6 +39,15 @@
* OSD: The issue of high CPU utilization during recovery/backfill operations
has been fixed. For more details, see: https://tracker.ceph.com/issues/56530.

* Trimming of PGLog dups is now controlled by the size instead of the version.
This fixes the PGLog inflation issue that was happening when the on-line
(in OSD) trimming got jammed after a PG split operation. Also, a new off-line
mechanism has been added: `ceph-objectstore-tool` got a `trim-pg-log-dups` op
that targets situations where an OSD is unable to boot due to those inflated dups.
If that is the case, the "You can be hit by THE DUPS BUG" warning will be
visible in the OSD logs.
Relevant tracker: https://tracker.ceph.com/issues/53729

>=17.2.1

* The "BlueStore zero block detection" feature (first introduced to Quincy in
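For context, a few illustrative invocations related to the notes above; option spellings such as ``--namespace``, ``--data-path`` and ``--pgid`` follow common usage and should be read as a sketch rather than authoritative syntax::

   # Inspect the relocated fields (the jq filter is illustrative):
   ceph mgr dump | jq '.last_failure_osd_epoch, .active_clients'

   # Unmap an image in a namespace using the new option (pool/namespace/image are placeholders):
   rbd device unmap --pool mypool --namespace myns --image myimage

   # Off-line trimming of inflated PGLog dups on a stopped OSD (path and PG id are placeholders):
   ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-2 --op trim-pg-log-dups --pgid 2.7b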
ceph/admin/rtd-checkout-main (deleted)
@@ -1,10 +0,0 @@
# See .readthedocs.yml
set -ex
re='^[0-9]+$'
if [[ $READTHEDOCS_VERSION =~ $re ]]; then
echo "Building docs for PR $READTHEDOCS_VERSION. Will not check out doc/releases from main branch."
else
echo "Building docs for $READTHEDOCS_VERSION branch. Will check out doc/releases from main branch."
git checkout origin/main -- doc/releases
fi
git status
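The removed hook can still be exercised by hand; a minimal sketch, assuming the file is restored and run from the repository root::

   # A numeric READTHEDOCS_VERSION denotes a PR build, anything else a branch build.
   READTHEDOCS_VERSION=12345 bash admin/rtd-checkout-main   # PR build: no checkout
   READTHEDOCS_VERSION=quincy bash admin/rtd-checkout-main  # branch build: checks out doc/releases from main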
@ -61,7 +61,11 @@
|
||||
%global _remote_tarball_prefix https://download.ceph.com/tarballs/
|
||||
%endif
|
||||
%if 0%{?suse_version}
|
||||
%ifarch s390x
|
||||
%bcond_with system_pmdk
|
||||
%else
|
||||
%bcond_without system_pmdk
|
||||
%endif
|
||||
%bcond_with amqp_endpoint
|
||||
%bcond_with cephfs_java
|
||||
%bcond_with kafka_endpoint
|
||||
@ -162,7 +166,7 @@
|
||||
# main package definition
|
||||
#################################################################################
|
||||
Name: ceph
|
||||
Version: 17.2.5
|
||||
Version: 17.2.6
|
||||
Release: 0%{?dist}
|
||||
%if 0%{?fedora} || 0%{?rhel}
|
||||
Epoch: 2
|
||||
@ -178,7 +182,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
|
||||
Group: System/Filesystems
|
||||
%endif
|
||||
URL: http://ceph.com/
|
||||
Source0: %{?_remote_tarball_prefix}ceph-17.2.5.tar.bz2
|
||||
Source0: %{?_remote_tarball_prefix}ceph-17.2.6.tar.bz2
|
||||
%if 0%{?suse_version}
|
||||
# _insert_obs_source_lines_here
|
||||
ExclusiveArch: x86_64 aarch64 ppc64le s390x
|
||||
@ -202,9 +206,12 @@ BuildRequires: selinux-policy-devel
|
||||
BuildRequires: gperf
|
||||
BuildRequires: cmake > 3.5
|
||||
BuildRequires: fuse-devel
|
||||
%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} == 9
|
||||
%if 0%{?fedora} || 0%{?suse_version} > 1500 || 0%{?rhel} == 9
|
||||
BuildRequires: gcc-c++ >= 11
|
||||
%endif
|
||||
%if 0%{?suse_version} == 1500
|
||||
BuildRequires: gcc11-c++
|
||||
%endif
|
||||
%if 0%{?rhel} == 8
|
||||
BuildRequires: %{gts_prefix}-gcc-c++
|
||||
BuildRequires: %{gts_prefix}-build
|
||||
@ -648,6 +655,7 @@ Requires: python%{python3_pkgversion}-pecan
|
||||
Requires: python%{python3_pkgversion}-pyOpenSSL
|
||||
Requires: python%{python3_pkgversion}-requests
|
||||
Requires: python%{python3_pkgversion}-dateutil
|
||||
Requires: python%{python3_pkgversion}-setuptools
|
||||
%if 0%{?fedora} || 0%{?rhel} >= 8
|
||||
Requires: python%{python3_pkgversion}-cherrypy
|
||||
Requires: python%{python3_pkgversion}-pyyaml
|
||||
@ -1266,7 +1274,7 @@ This package provides Ceph default alerts for Prometheus.
|
||||
# common
|
||||
#################################################################################
|
||||
%prep
|
||||
%autosetup -p1 -n ceph-17.2.5
|
||||
%autosetup -p1 -n ceph-17.2.6
|
||||
|
||||
%build
|
||||
# Disable lto on systems that do not support symver attribute
|
||||
@ -1307,6 +1315,10 @@ env | sort
|
||||
mkdir -p %{_vpath_builddir}
|
||||
pushd %{_vpath_builddir}
|
||||
cmake .. \
|
||||
%if 0%{?suse_version} == 1500
|
||||
-DCMAKE_C_COMPILER=gcc-11 \
|
||||
-DCMAKE_CXX_COMPILER=g++-11 \
|
||||
%endif
|
||||
-DCMAKE_INSTALL_PREFIX=%{_prefix} \
|
||||
-DCMAKE_INSTALL_LIBDIR:PATH=%{_libdir} \
|
||||
-DCMAKE_INSTALL_LIBEXECDIR:PATH=%{_libexecdir} \
|
||||
@ -1461,7 +1473,7 @@ touch %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
|
||||
chmod 0600 %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
|
||||
|
||||
# firewall templates and /sbin/mount.ceph symlink
|
||||
%if 0%{?suse_version} && !0%{?usrmerged}
|
||||
%if 0%{?suse_version} && 0%{?suse_version} < 1550
|
||||
mkdir -p %{buildroot}/sbin
|
||||
ln -sf %{_sbindir}/mount.ceph %{buildroot}/sbin/mount.ceph
|
||||
%endif
|
||||
@ -1637,7 +1649,7 @@ exit 0
|
||||
%{_bindir}/rbd-replay-many
|
||||
%{_bindir}/rbdmap
|
||||
%{_sbindir}/mount.ceph
|
||||
%if 0%{?suse_version} && !0%{?usrmerged}
|
||||
%if 0%{?suse_version} && 0%{?suse_version} < 1550
|
||||
/sbin/mount.ceph
|
||||
%endif
|
||||
%if %{with lttng}
|
||||
|
@ -61,7 +61,11 @@
|
||||
%global _remote_tarball_prefix https://download.ceph.com/tarballs/
|
||||
%endif
|
||||
%if 0%{?suse_version}
|
||||
%ifarch s390x
|
||||
%bcond_with system_pmdk
|
||||
%else
|
||||
%bcond_without system_pmdk
|
||||
%endif
|
||||
%bcond_with amqp_endpoint
|
||||
%bcond_with cephfs_java
|
||||
%bcond_with kafka_endpoint
|
||||
@ -202,9 +206,12 @@ BuildRequires: selinux-policy-devel
|
||||
BuildRequires: gperf
|
||||
BuildRequires: cmake > 3.5
|
||||
BuildRequires: fuse-devel
|
||||
%if 0%{?fedora} || 0%{?suse_version} || 0%{?rhel} == 9
|
||||
%if 0%{?fedora} || 0%{?suse_version} > 1500 || 0%{?rhel} == 9
|
||||
BuildRequires: gcc-c++ >= 11
|
||||
%endif
|
||||
%if 0%{?suse_version} == 1500
|
||||
BuildRequires: gcc11-c++
|
||||
%endif
|
||||
%if 0%{?rhel} == 8
|
||||
BuildRequires: %{gts_prefix}-gcc-c++
|
||||
BuildRequires: %{gts_prefix}-build
|
||||
@ -648,6 +655,7 @@ Requires: python%{python3_pkgversion}-pecan
|
||||
Requires: python%{python3_pkgversion}-pyOpenSSL
|
||||
Requires: python%{python3_pkgversion}-requests
|
||||
Requires: python%{python3_pkgversion}-dateutil
|
||||
Requires: python%{python3_pkgversion}-setuptools
|
||||
%if 0%{?fedora} || 0%{?rhel} >= 8
|
||||
Requires: python%{python3_pkgversion}-cherrypy
|
||||
Requires: python%{python3_pkgversion}-pyyaml
|
||||
@ -1307,6 +1315,10 @@ env | sort
|
||||
mkdir -p %{_vpath_builddir}
|
||||
pushd %{_vpath_builddir}
|
||||
cmake .. \
|
||||
%if 0%{?suse_version} == 1500
|
||||
-DCMAKE_C_COMPILER=gcc-11 \
|
||||
-DCMAKE_CXX_COMPILER=g++-11 \
|
||||
%endif
|
||||
-DCMAKE_INSTALL_PREFIX=%{_prefix} \
|
||||
-DCMAKE_INSTALL_LIBDIR:PATH=%{_libdir} \
|
||||
-DCMAKE_INSTALL_LIBEXECDIR:PATH=%{_libexecdir} \
|
||||
@ -1461,7 +1473,7 @@ touch %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
|
||||
chmod 0600 %{buildroot}%{_sharedstatedir}/cephadm/.ssh/authorized_keys
|
||||
|
||||
# firewall templates and /sbin/mount.ceph symlink
|
||||
%if 0%{?suse_version} && !0%{?usrmerged}
|
||||
%if 0%{?suse_version} && 0%{?suse_version} < 1550
|
||||
mkdir -p %{buildroot}/sbin
|
||||
ln -sf %{_sbindir}/mount.ceph %{buildroot}/sbin/mount.ceph
|
||||
%endif
|
||||
@ -1637,7 +1649,7 @@ exit 0
|
||||
%{_bindir}/rbd-replay-many
|
||||
%{_bindir}/rbdmap
|
||||
%{_sbindir}/mount.ceph
|
||||
%if 0%{?suse_version} && !0%{?usrmerged}
|
||||
%if 0%{?suse_version} && 0%{?suse_version} < 1550
|
||||
/sbin/mount.ceph
|
||||
%endif
|
||||
%if %{with lttng}
|
||||
|
ceph/debian/changelog
@@ -1,3 +1,9 @@
ceph (17.2.6-1) stable; urgency=medium

* New upstream release

-- Ceph Release Team <ceph-maintainers@ceph.io> Wed, 05 Apr 2023 15:09:49 +0000

ceph (17.2.5-1) stable; urgency=medium

* New upstream release

ceph/cmake/modules/BuildFio.cmake
@@ -2,8 +2,14 @@ function(build_fio)
# we use an external project and copy the sources to bin directory to ensure
# that object files are built outside of the source tree.
include(ExternalProject)
if(ALLOCATOR)
set(FIO_EXTLIBS EXTLIBS=-l${ALLOCATOR})
if(ALLOC_LIBS)
get_target_property(alloc_lib_path
${ALLOC_LIBS} IMPORTED_LOCATION)
get_filename_component(alloc_lib_dir
${alloc_lib_path} DIRECTORY)
get_filename_component(alloc_lib_name
${alloc_lib_path} NAME)
set(FIO_EXTLIBS "EXTLIBS='-L${alloc_lib_dir} -l:${alloc_lib_name}'")
endif()

include(FindMake)
@@ -20,7 +26,7 @@ function(build_fio)
SOURCE_DIR ${source_dir}
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND <SOURCE_DIR>/configure
BUILD_COMMAND ${make_cmd} fio EXTFLAGS=-Wno-format-truncation ${FIO_EXTLIBS}
BUILD_COMMAND ${make_cmd} fio EXTFLAGS=-Wno-format-truncation "${FIO_EXTLIBS}"
INSTALL_COMMAND cp <BINARY_DIR>/fio ${CMAKE_BINARY_DIR}/bin
LOG_CONFIGURE ON
LOG_BUILD ON
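For context, the ``FIO_EXTLIBS`` handling above only matters when the fio engine is built; a hedged configure sketch (the ``WITH_FIO`` option name is assumed here)::

   # Enable the fio external project alongside the selected allocator.
   cmake -DWITH_FIO=ON -DALLOCATOR=tcmalloc ..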
ceph/debian/control
@@ -93,6 +93,7 @@ Build-Depends: automake,
tox <pkg.ceph.check>,
python3-coverage <pkg.ceph.check>,
python3-dateutil <pkg.ceph.check>,
python3-pkg-resources <pkg.ceph.check>,
python3-openssl <pkg.ceph.check>,
python3-prettytable <pkg.ceph.check>,
python3-requests <pkg.ceph.check>,
ceph/doc/_static/css/custom.css (vendored)
@@ -1,3 +1,23 @@
dt {
scroll-margin-top: 3em;
}

h2 {
scroll-margin-top: 4em;
}

h3 {
scroll-margin-top: 4em;
}

section {
scroll-margin-top: 4em;
}

span {
scroll-margin-top: 2em;
}

ul.simple > li > ul > li:last-child {
margin-block-end : 1em;
}
@ -13,6 +13,7 @@ replicate and redistribute data dynamically.
|
||||
|
||||
.. image:: images/stack.png
|
||||
|
||||
.. _arch-ceph-storage-cluster:
|
||||
|
||||
The Ceph Storage Cluster
|
||||
========================
|
||||
@ -59,7 +60,7 @@ service interfaces built on top of ``librados``.
|
||||
Storing Data
|
||||
------------
|
||||
|
||||
The Ceph Storage Cluster receives data from :term:`Ceph Clients`--whether it
|
||||
The Ceph Storage Cluster receives data from :term:`Ceph Client`\s--whether it
|
||||
comes through a :term:`Ceph Block Device`, :term:`Ceph Object Storage`, the
|
||||
:term:`Ceph File System` or a custom implementation you create using
|
||||
``librados``-- which is stored as RADOS objects. Each object is stored on an
|
||||
@ -80,7 +81,7 @@ stored in a monolithic database-like fashion.
|
||||
Ceph OSD Daemons store data as objects in a flat namespace (e.g., no
|
||||
hierarchy of directories). An object has an identifier, binary data, and
|
||||
metadata consisting of a set of name/value pairs. The semantics are completely
|
||||
up to :term:`Ceph Clients`. For example, CephFS uses metadata to store file
|
||||
up to :term:`Ceph Client`\s. For example, CephFS uses metadata to store file
|
||||
attributes such as the file owner, created date, last modified date, and so
|
||||
forth.
|
||||
|
||||
@ -135,6 +136,8 @@ Placement of Replicated Data`_.
|
||||
|
||||
.. index:: architecture; cluster map
|
||||
|
||||
.. _architecture_cluster_map:
|
||||
|
||||
Cluster Map
|
||||
~~~~~~~~~~~
|
||||
|
||||
@ -581,7 +584,7 @@ objects.
|
||||
Peering and Sets
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
In previous sections, we noted that Ceph OSD Daemons check each others
|
||||
In previous sections, we noted that Ceph OSD Daemons check each other's
|
||||
heartbeats and report back to the Ceph Monitor. Another thing Ceph OSD daemons
|
||||
do is called 'peering', which is the process of bringing all of the OSDs that
|
||||
store a Placement Group (PG) into agreement about the state of all of the
|
||||
@ -1619,13 +1622,13 @@ instance for high availability.
|
||||
|
||||
|
||||
|
||||
.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: https://ceph.com/wp-content/uploads/2016/08/weil-rados-pdsw07.pdf
|
||||
.. _RADOS - A Scalable, Reliable Storage Service for Petabyte-scale Storage Clusters: https://ceph.io/assets/pdfs/weil-rados-pdsw07.pdf
|
||||
.. _Paxos: https://en.wikipedia.org/wiki/Paxos_(computer_science)
|
||||
.. _Monitor Config Reference: ../rados/configuration/mon-config-ref
|
||||
.. _Monitoring OSDs and PGs: ../rados/operations/monitoring-osd-pg
|
||||
.. _Heartbeats: ../rados/configuration/mon-osd-interaction
|
||||
.. _Monitoring OSDs: ../rados/operations/monitoring-osd-pg/#monitoring-osds
|
||||
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.com/wp-content/uploads/2016/08/weil-crush-sc06.pdf
|
||||
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.io/assets/pdfs/weil-crush-sc06.pdf
|
||||
.. _Data Scrubbing: ../rados/configuration/osd-config-ref#scrubbing
|
||||
.. _Report Peering Failure: ../rados/configuration/mon-osd-interaction#osds-report-peering-failure
|
||||
.. _Troubleshooting Peering Failure: ../rados/troubleshooting/troubleshooting-pg#placement-group-down-peering-failure
|
||||
|
@ -13,7 +13,7 @@ understand what OSD is enabled and needs to be mounted.
|
||||
.. note:: The execution of this call is fully idempotent, and there is no
|
||||
side-effects when running multiple times
|
||||
|
||||
For OSDs deployed by cephadm, please refer to :ref:cephadm-osd-activate:
|
||||
For OSDs deployed by cephadm, please refer to :ref:`cephadm-osd-activate`
|
||||
instead.
|
||||
|
||||
New OSDs
|
||||
@ -29,7 +29,7 @@ need to be supplied. For example::
|
||||
Activating all OSDs
|
||||
-------------------
|
||||
|
||||
.. note:: For OSDs deployed by cephadm, please refer to :ref:cephadm-osd-activate:
|
||||
.. note:: For OSDs deployed by cephadm, please refer to :ref:`cephadm-osd-activate`
|
||||
instead.
|
||||
|
||||
It is possible to activate all existing OSDs at once by using the ``--all``
|
||||
|
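To make the activation flow above concrete, a hedged example (the OSD id and fsid are placeholders)::

   ceph-volume lvm activate 0 0263644D-0BF1-4D6D-BC34-28BD98AE3BC8   # one OSD: <osd id> <osd fsid>
   ceph-volume lvm activate --all                                    # every OSD discovered on this host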
@ -4,45 +4,41 @@ Encryption
|
||||
==========
|
||||
|
||||
Logical volumes can be encrypted using ``dmcrypt`` by specifying the
|
||||
``--dmcrypt`` flag when creating OSDs. Encryption can be done in different ways,
|
||||
specially with LVM. ``ceph-volume`` is somewhat opinionated with the way it
|
||||
sets up encryption with logical volumes so that the process is consistent and
|
||||
``--dmcrypt`` flag when creating OSDs. When using LVM, logical volumes can be
|
||||
encrypted in different ways. ``ceph-volume`` does not offer as many options as
|
||||
LVM does, but it encrypts logical volumes in a way that is consistent and
|
||||
robust.
|
||||
|
||||
In this case, ``ceph-volume lvm`` follows these constraints:
|
||||
In this case, ``ceph-volume lvm`` follows this constraint:
|
||||
|
||||
* only LUKS (version 1) is used
|
||||
* Logical Volumes are encrypted, while their underlying PVs (physical volumes)
|
||||
aren't
|
||||
* Non-LVM devices like partitions are also encrypted with the same OSD key
|
||||
* Non-LVM devices (such as partitions) are encrypted with the same OSD key.
|
||||
|
||||
|
||||
LUKS
|
||||
----
|
||||
There are currently two versions of LUKS, 1 and 2. Version 2 is a bit easier
|
||||
to implement but not widely available in all distros Ceph supports. LUKS 1 is
|
||||
not going to be deprecated in favor of LUKS 2, so in order to have as wide
|
||||
support as possible, ``ceph-volume`` uses LUKS version 1.
|
||||
There are currently two versions of LUKS, 1 and 2. Version 2 is a bit easier to
|
||||
implement but not widely available in all Linux distributions supported by
|
||||
Ceph.
|
||||
|
||||
.. note:: Version 1 of LUKS is just referenced as "LUKS" whereas version 2 is
|
||||
referred to as LUKS2
|
||||
.. note:: Version 1 of LUKS is referred to in this documentation as "LUKS".
|
||||
Version 2 of LUKS is referred to in this documentation as "LUKS2".
|
||||
|
||||
|
||||
LUKS on LVM
|
||||
-----------
|
||||
Encryption is done on top of existing logical volumes (unlike encrypting the
|
||||
physical device). Any single logical volume can be encrypted while other
|
||||
volumes can remain unencrypted. This method also allows for flexible logical
|
||||
Encryption is done on top of existing logical volumes (this is not the same as
|
||||
encrypting the physical device). Any single logical volume can be encrypted,
|
||||
leaving other volumes unencrypted. This method also allows for flexible logical
|
||||
volume setups, since encryption will happen once the LV is created.
|
||||
|
||||
|
||||
Workflow
|
||||
--------
|
||||
When setting up the OSD, a secret key will be created, that will be passed
|
||||
along to the monitor in JSON format as ``stdin`` to prevent the key from being
|
||||
When setting up the OSD, a secret key is created. That secret key is passed
|
||||
to the monitor in JSON format as ``stdin`` to prevent the key from being
|
||||
captured in the logs.
|
||||
|
||||
The JSON payload looks something like::
|
||||
The JSON payload looks something like this::
|
||||
|
||||
{
|
||||
"cephx_secret": CEPHX_SECRET,
|
||||
@ -51,36 +47,38 @@ The JSON payload looks something like::
|
||||
}
|
||||
|
||||
The naming convention for the keys is **strict**, and they are named like that
|
||||
for the hardcoded (legacy) names ceph-disk used.
|
||||
for the hardcoded (legacy) names used by ceph-disk.
|
||||
|
||||
* ``cephx_secret`` : The cephx key used to authenticate
|
||||
* ``dmcrypt_key`` : The secret (or private) key to unlock encrypted devices
|
||||
* ``cephx_lockbox_secret`` : The authentication key used to retrieve the
|
||||
``dmcrypt_key``. It is named *lockbox* because ceph-disk used to have an
|
||||
unencrypted partition named after it, used to store public keys and other
|
||||
OSD metadata.
|
||||
unencrypted partition named after it, which was used to store public keys and
|
||||
other OSD metadata.
|
||||
|
||||
The naming convention is strict because Monitors supported the naming
|
||||
convention by ceph-disk, which used these key names. In order to keep
|
||||
compatibility and prevent ceph-disk from breaking, ceph-volume will use the same
|
||||
naming convention *although they don't make sense for the new encryption
|
||||
convention of ceph-disk, which used these key names. In order to maintain
|
||||
compatibility and prevent ceph-disk from breaking, ceph-volume uses the same
|
||||
naming convention *although it does not make sense for the new encryption
|
||||
workflow*.
|
||||
|
||||
After the common steps of setting up the OSD during the prepare stage, either
|
||||
with :term:`filestore` or :term:`bluestore`, the logical volume is left ready
|
||||
to be activated, regardless of the state of the device (encrypted or decrypted).
|
||||
After the common steps of setting up the OSD during the "prepare stage" (either
|
||||
with :term:`filestore` or :term:`bluestore`), the logical volume is left ready
|
||||
to be activated, regardless of the state of the device (encrypted or
|
||||
decrypted).
|
||||
|
||||
At activation time, the logical volume will get decrypted and the OSD started
|
||||
once the process completes correctly.
|
||||
At the time of its activation, the logical volume is decrypted. The OSD starts
|
||||
after the process completes correctly.
|
||||
|
||||
Summary of the encryption workflow for creating a new OSD:
|
||||
Summary of the encryption workflow for creating a new OSD
|
||||
----------------------------------------------------------
|
||||
|
||||
#. OSD is created, both lockbox and dmcrypt keys are created, and sent along
|
||||
with JSON to the monitors, indicating an encrypted OSD.
|
||||
#. OSD is created. Both lockbox and dmcrypt keys are created and sent to the
|
||||
monitors in JSON format, indicating an encrypted OSD.
|
||||
|
||||
#. All complementary devices (like journal, db, or wal) get created and
|
||||
encrypted with the same OSD key. Key is stored in the LVM metadata of the
|
||||
OSD
|
||||
OSD.
|
||||
|
||||
#. Activation continues by ensuring devices are mounted, retrieving the dmcrypt
|
||||
secret key from the monitors and decrypting before the OSD gets started.
|
||||
secret key from the monitors, and decrypting before the OSD gets started.
|
||||
|
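A minimal end-to-end sketch of the encrypted-OSD workflow described above, assuming a pre-provisioned logical volume ``vg/lv``::

   ceph-volume lvm prepare --bluestore --dmcrypt --data vg/lv   # creates the dmcrypt key and sends it to the monitors
   ceph-volume lvm activate --all                               # retrieves the key, decrypts the LV, starts the OSD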
@ -2,25 +2,22 @@
|
||||
|
||||
``prepare``
|
||||
===========
|
||||
This subcommand allows a :term:`filestore` or :term:`bluestore` setup. It is
|
||||
recommended to pre-provision a logical volume before using it with
|
||||
``ceph-volume lvm``.
|
||||
Before you run ``ceph-volume lvm prepare``, we recommend that you provision a
|
||||
logical volume. Then you can run ``prepare`` on that logical volume.
|
||||
|
||||
Logical volumes are not altered except for adding extra metadata.
|
||||
``prepare`` adds metadata to logical volumes but does not alter them in any
|
||||
other way.
|
||||
|
||||
.. note:: This is part of a two step process to deploy an OSD. If looking for
|
||||
a single-call way, please see :ref:`ceph-volume-lvm-create`
|
||||
.. note:: This is part of a two-step process to deploy an OSD. If you prefer
|
||||
to deploy an OSD by using only one command, see :ref:`ceph-volume-lvm-create`.
|
||||
|
||||
To help identify volumes, the process of preparing a volume (or volumes) to
|
||||
work with Ceph, the tool will assign a few pieces of metadata information using
|
||||
:term:`LVM tags`.
|
||||
|
||||
:term:`LVM tags` makes volumes easy to discover later, and help identify them as
|
||||
part of a Ceph system, and what role they have (journal, filestore, bluestore,
|
||||
etc...)
|
||||
|
||||
Although :term:`bluestore` is the default, the back end can be specified with:
|
||||
``prepare`` uses :term:`LVM tags` to assign several pieces of metadata to a
|
||||
logical volume. Volumes tagged in this way are easier to identify and easier to
|
||||
use with Ceph. :term:`LVM tags` identify logical volumes by the role that they
|
||||
play in the Ceph cluster (for example: BlueStore data or BlueStore WAL+DB).
|
||||
|
||||
:term:`BlueStore<bluestore>` is the default backend. Ceph permits changing
|
||||
the backend, which can be done by using the following flags and arguments:
|
||||
|
||||
* :ref:`--filestore <ceph-volume-lvm-prepare_filestore>`
|
||||
* :ref:`--bluestore <ceph-volume-lvm-prepare_bluestore>`
|
||||
@ -29,50 +26,58 @@ Although :term:`bluestore` is the default, the back end can be specified with:
|
||||
|
||||
``bluestore``
|
||||
-------------
|
||||
The :term:`bluestore` objectstore is the default for new OSDs. It offers a bit
|
||||
more flexibility for devices compared to :term:`filestore`.
|
||||
Bluestore supports the following configurations:
|
||||
:term:`Bluestore<bluestore>` is the default backend for new OSDs. It
|
||||
offers more flexibility for devices than :term:`filestore` does. Bluestore
|
||||
supports the following configurations:
|
||||
|
||||
* A block device, a block.wal, and a block.db device
|
||||
* A block device and a block.wal device
|
||||
* A block device and a block.db device
|
||||
* A single block device
|
||||
* a block device, a block.wal device, and a block.db device
|
||||
* a block device and a block.wal device
|
||||
* a block device and a block.db device
|
||||
* a single block device
|
||||
|
||||
The bluestore subcommand accepts physical block devices, partitions on
|
||||
physical block devices or logical volumes as arguments for the various device parameters
|
||||
If a physical device is provided, a logical volume will be created. A volume group will
|
||||
either be created or reused it its name begins with ``ceph``.
|
||||
This allows a simpler approach at using LVM but at the cost of flexibility:
|
||||
there are no options or configurations to change how the LV is created.
|
||||
The ``bluestore`` subcommand accepts physical block devices, partitions on physical
|
||||
block devices, or logical volumes as arguments for the various device
|
||||
parameters. If a physical block device is provided, a logical volume will be
|
||||
created. If the provided volume group's name begins with `ceph`, it will be
|
||||
created if it does not yet exist and it will be clobbered and reused if it
|
||||
already exists. This allows for a simpler approach to using LVM but at the
|
||||
cost of flexibility: no option or configuration can be used to change how the
|
||||
logical volume is created.
|
||||
|
||||
The ``block`` is specified with the ``--data`` flag, and in its simplest use
|
||||
case it looks like::
|
||||
case it looks like:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-volume lvm prepare --bluestore --data vg/lv
|
||||
|
||||
A raw device can be specified in the same way::
|
||||
A raw device can be specified in the same way:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-volume lvm prepare --bluestore --data /path/to/device
|
||||
|
||||
For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required::
|
||||
For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-volume lvm prepare --bluestore --dmcrypt --data vg/lv
|
||||
|
||||
If a ``block.db`` or a ``block.wal`` is needed (they are optional for
|
||||
bluestore) they can be specified with ``--block.db`` and ``--block.wal``
|
||||
accordingly. These can be a physical device, a partition or
|
||||
a logical volume.
|
||||
If a ``block.db`` device or a ``block.wal`` device is needed, it can be
|
||||
specified with ``--block.db`` or ``--block.wal``. These can be physical
|
||||
devices, partitions, or logical volumes. ``block.db`` and ``block.wal`` are
|
||||
optional for bluestore.
|
||||
|
||||
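As a sketch of the ``--block.db``/``--block.wal`` usage just described (the device and volume names are hypothetical)::

   ceph-volume lvm prepare --bluestore \
       --data vg/osd-block-lv \
       --block.db /dev/nvme0n1p1 \
       --block.wal /dev/nvme0n1p2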
For both ``block.db`` and ``block.wal`` partitions aren't made logical volumes
|
||||
because they can be used as-is.
|
||||
For both ``block.db`` and ``block.wal``, partitions can be used as-is, and
|
||||
therefore are not made into logical volumes.
|
||||
|
||||
While creating the OSD directory, the process will use a ``tmpfs`` mount to
|
||||
place all the files needed for the OSD. These files are initially created by
|
||||
``ceph-osd --mkfs`` and are fully ephemeral.
|
||||
While creating the OSD directory, the process uses a ``tmpfs`` mount to hold
|
||||
the files needed for the OSD. These files are created by ``ceph-osd --mkfs``
|
||||
and are ephemeral.
|
||||
|
||||
A symlink is always created for the ``block`` device, and optionally for
|
||||
``block.db`` and ``block.wal``. For a cluster with a default name, and an OSD
|
||||
id of 0, the directory could look like::
|
||||
A symlink is created for the ``block`` device, and is optional for ``block.db``
|
||||
and ``block.wal``. For a cluster with a default name and an OSD ID of 0, the
|
||||
directory looks like this::
|
||||
|
||||
# ls -l /var/lib/ceph/osd/ceph-0
|
||||
lrwxrwxrwx. 1 ceph ceph 93 Oct 20 13:05 block -> /dev/ceph-be2b6fbd-bcf2-4c51-b35d-a35a162a02f0/osd-block-25cf0a05-2bc6-44ef-9137-79d65bd7ad62
|
||||
@ -85,11 +90,11 @@ id of 0, the directory could look like::
|
||||
-rw-------. 1 ceph ceph 10 Oct 20 13:05 type
|
||||
-rw-------. 1 ceph ceph 2 Oct 20 13:05 whoami
|
||||
|
||||
In the above case, a device was used for ``block`` so ``ceph-volume`` create
|
||||
a volume group and a logical volume using the following convention:
|
||||
In the above case, a device was used for ``block``, so ``ceph-volume`` created
|
||||
a volume group and a logical volume using the following conventions:
|
||||
|
||||
* volume group name: ``ceph-{cluster fsid}`` or if the vg exists already
|
||||
``ceph-{random uuid}``
|
||||
* volume group name: ``ceph-{cluster fsid}`` (or if the volume group already
|
||||
exists: ``ceph-{random uuid}``)
|
||||
|
||||
* logical volume name: ``osd-block-{osd_fsid}``
|
||||
|
||||
@ -98,78 +103,100 @@ a volume group and a logical volume using the following convention:
|
||||
|
||||
``filestore``
|
||||
-------------
|
||||
This is the OSD backend that allows preparation of logical volumes for
|
||||
a :term:`filestore` objectstore OSD.
|
||||
``Filestore<filestore>`` is the OSD backend that prepares logical volumes for a
|
||||
:term:`filestore`-backed object-store OSD.
|
||||
|
||||
It can use a logical volume for the OSD data and a physical device, a partition
|
||||
or logical volume for the journal. A physical device will have a logical volume
|
||||
created on it. A volume group will either be created or reused it its name begins
|
||||
with ``ceph``. No special preparation is needed for these volumes other than
|
||||
following the minimum size requirements for data and journal.
|
||||
|
||||
The CLI call looks like this of a basic standalone filestore OSD::
|
||||
``Filestore<filestore>`` uses a logical volume to store OSD data and it uses
|
||||
physical devices, partitions, or logical volumes to store the journal. If a
|
||||
physical device is used to create a filestore backend, a logical volume will be
|
||||
created on that physical device. If the provided volume group's name begins
|
||||
with `ceph`, it will be created if it does not yet exist and it will be
|
||||
clobbered and reused if it already exists. No special preparation is needed for
|
||||
these volumes, but be sure to meet the minimum size requirements for OSD data and
|
||||
for the journal.
|
||||
|
||||
Use the following command to create a basic filestore OSD:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-volume lvm prepare --filestore --data <data block device>
|
||||
|
||||
To deploy file store with an external journal::
|
||||
Use this command to deploy filestore with an external journal:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-volume lvm prepare --filestore --data <data block device> --journal <journal block device>
|
||||
|
||||
For enabling :ref:`encryption <ceph-volume-lvm-encryption>`, the ``--dmcrypt`` flag is required::
|
||||
Use this command to enable :ref:`encryption <ceph-volume-lvm-encryption>`, and note that the ``--dmcrypt`` flag is required:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-volume lvm prepare --filestore --dmcrypt --data <data block device> --journal <journal block device>
|
||||
|
||||
Both the journal and data block device can take three forms:
|
||||
The data block device and the journal can each take one of three forms:
|
||||
|
||||
* a physical block device
|
||||
* a partition on a physical block device
|
||||
* a logical volume
|
||||
|
||||
When using logical volumes the value *must* be of the format
|
||||
``volume_group/logical_volume``. Since logical volume names
|
||||
are not enforced for uniqueness, this prevents accidentally
|
||||
choosing the wrong volume.
|
||||
If you use a logical volume to deploy filestore, the value that you pass in the
|
||||
command *must* be of the format ``volume_group/logical_volume_name``. Since logical
|
||||
volume names are not enforced for uniqueness, using this format is an important
|
||||
safeguard against accidentally choosing the wrong volume (and clobbering its data).
|
||||
|
||||
When using a partition, it *must* contain a ``PARTUUID``, that can be
|
||||
discovered by ``blkid``. THis ensure it can later be identified correctly
|
||||
regardless of the device name (or path).
|
||||
If you use a partition to deploy filestore, the partition *must* contain a
|
||||
``PARTUUID`` that can be discovered by ``blkid``. This ensures that the
|
||||
partition can be identified correctly regardless of the device's name (or path).
|
||||
|
||||
For example: passing a logical volume for data and a partition ``/dev/sdc1`` for
|
||||
the journal::
|
||||
For example, to use a logical volume for OSD data and a partition
|
||||
(``/dev/sdc1``) for the journal, run a command of this form:
|
||||
|
||||
ceph-volume lvm prepare --filestore --data volume_group/lv_name --journal /dev/sdc1
|
||||
.. prompt:: bash #
|
||||
|
||||
Passing a bare device for data and a logical volume ias the journal::
|
||||
ceph-volume lvm prepare --filestore --data volume_group/logical_volume_name --journal /dev/sdc1
|
||||
|
||||
Or, to use a bare device for data and a logical volume for the journal:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-volume lvm prepare --filestore --data /dev/sdc --journal volume_group/journal_lv
|
||||
|
||||
A generated uuid is used to ask the cluster for a new OSD. These two pieces are
|
||||
crucial for identifying an OSD and will later be used throughout the
|
||||
:ref:`ceph-volume-lvm-activate` process.
|
||||
A generated UUID is used when asking the cluster for a new OSD. These two
|
||||
pieces of information (the OSD ID and the OSD UUID) are necessary for
|
||||
identifying a given OSD and will later be used throughout the
|
||||
:ref:`activation<ceph-volume-lvm-activate>` process.
|
||||
|
||||
The OSD data directory is created using the following convention::
|
||||
|
||||
/var/lib/ceph/osd/<cluster name>-<osd id>
|
||||
|
||||
At this point the data volume is mounted at this location, and the journal
|
||||
volume is linked::
|
||||
To link the journal volume to the mounted data volume, use this command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ln -s /path/to/journal /var/lib/ceph/osd/<cluster_name>-<osd-id>/journal
|
||||
|
||||
The monmap is fetched using the bootstrap key from the OSD::
|
||||
To fetch the monmap by using the bootstrap key from the OSD, use this command:
|
||||
|
||||
/usr/bin/ceph --cluster ceph --name client.bootstrap-osd
|
||||
--keyring /var/lib/ceph/bootstrap-osd/ceph.keyring
|
||||
mon getmap -o /var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap
|
||||
.. prompt:: bash #
|
||||
|
||||
``ceph-osd`` will be called to populate the OSD directory, that is already
|
||||
mounted, re-using all the pieces of information from the initial steps::
|
||||
/usr/bin/ceph --cluster ceph --name client.bootstrap-osd --keyring
|
||||
/var/lib/ceph/bootstrap-osd/ceph.keyring mon getmap -o
|
||||
/var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap
|
||||
|
||||
To populate the OSD directory (which has already been mounted), use this ``ceph-osd`` command:
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph-osd --cluster ceph --mkfs --mkkey -i <osd id> \ --monmap
|
||||
/var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap --osd-data \
|
||||
/var/lib/ceph/osd/<cluster name>-<osd id> --osd-journal
|
||||
/var/lib/ceph/osd/<cluster name>-<osd id>/journal \ --osd-uuid <osd uuid>
|
||||
--keyring /var/lib/ceph/osd/<cluster name>-<osd id>/keyring \ --setuser ceph
|
||||
--setgroup ceph
|
||||
|
||||
All of the information from the previous steps is used in the above command.
|
||||
|
||||
ceph-osd --cluster ceph --mkfs --mkkey -i <osd id> \
|
||||
--monmap /var/lib/ceph/osd/<cluster name>-<osd id>/activate.monmap --osd-data \
|
||||
/var/lib/ceph/osd/<cluster name>-<osd id> --osd-journal /var/lib/ceph/osd/<cluster name>-<osd id>/journal \
|
||||
--osd-uuid <osd uuid> --keyring /var/lib/ceph/osd/<cluster name>-<osd id>/keyring \
|
||||
--setuser ceph --setgroup ceph
|
||||
|
||||
|
||||
.. _ceph-volume-lvm-partitions:
|
||||
|
@ -113,15 +113,15 @@ Adoption process
|
||||
ssh-copy-id -f -i ~/ceph.pub root@<host>
|
||||
|
||||
.. note::
|
||||
It is also possible to import an existing ssh key. See
|
||||
:ref:`ssh errors <cephadm-ssh-errors>` in the troubleshooting
|
||||
It is also possible to import an existing SSH key. See
|
||||
:ref:`SSH errors <cephadm-ssh-errors>` in the troubleshooting
|
||||
document for instructions that describe how to import existing
|
||||
ssh keys.
|
||||
SSH keys.
|
||||
|
||||
.. note::
|
||||
It is also possible to have cephadm use a non-root user to ssh
|
||||
It is also possible to have cephadm use a non-root user to SSH
|
||||
into cluster hosts. This user needs to have passwordless sudo access.
|
||||
Use ``ceph cephadm set-user <user>`` and copy the ssh key to that user.
|
||||
Use ``ceph cephadm set-user <user>`` and copy the SSH key to that user.
|
||||
See :ref:`cephadm-ssh-user`
|
||||
|
||||
#. Tell cephadm which hosts to manage:
|
||||
|
@ -8,27 +8,40 @@ Compatibility and Stability
|
||||
Compatibility with Podman Versions
|
||||
----------------------------------
|
||||
|
||||
Podman and Ceph have different end-of-life strategies that
|
||||
might make it challenging to find compatible Podman and Ceph
|
||||
versions
|
||||
Podman and Ceph have different end-of-life strategies. This means that care
|
||||
must be taken in finding a version of Podman that is compatible with Ceph.
|
||||
|
||||
Those versions are expected to work:
|
||||
This table shows which version pairs are expected to work or not work together:
|
||||
|
||||
|
||||
+-----------+---------------------------------------+
|
||||
+-----------+-----------------------------------------------+
|
||||
| Ceph | Podman |
|
||||
+-----------+-------+-------+-------+-------+-------+
|
||||
| | 1.9 | 2.0 | 2.1 | 2.2 | 3.0 |
|
||||
+===========+=======+=======+=======+=======+=======+
|
||||
| <= 15.2.5 | True | False | False | False | False |
|
||||
+-----------+-------+-------+-------+-------+-------+
|
||||
| >= 15.2.6 | True | True | True | False | False |
|
||||
+-----------+-------+-------+-------+-------+-------+
|
||||
| >= 16.2.1 | False | True | True | False | True |
|
||||
+-----------+-------+-------+-------+-------+-------+
|
||||
+-----------+-------+-------+-------+-------+-------+-------+
|
||||
| | 1.9 | 2.0 | 2.1 | 2.2 | 3.0 | > 3.0 |
|
||||
+===========+=======+=======+=======+=======+=======+=======+
|
||||
| <= 15.2.5 | True | False | False | False | False | False |
|
||||
+-----------+-------+-------+-------+-------+-------+-------+
|
||||
| >= 15.2.6 | True | True | True | False | False | False |
|
||||
+-----------+-------+-------+-------+-------+-------+-------+
|
||||
| >= 16.2.1 | False | True | True | False | True | True |
|
||||
+-----------+-------+-------+-------+-------+-------+-------+
|
||||
| >= 17.2.0 | False | True | True | False | True | True |
|
||||
+-----------+-------+-------+-------+-------+-------+-------+
|
||||
|
||||
.. note::
|
||||
|
||||
While not all podman versions have been actively tested against
|
||||
all Ceph versions, there are no known issues with using podman
|
||||
version 3.0 or greater with Ceph Quincy and later releases.
|
||||
|
||||
.. warning::
|
||||
Only podman versions that are 2.0.0 and higher work with Ceph Pacific, with the exception of podman version 2.2.1, which does not work with Ceph Pacific. kubic stable is known to work with Ceph Pacific, but it must be run with a newer kernel.
|
||||
|
||||
To use Podman with Ceph Pacific, you must use **a version of Podman that
|
||||
is 2.0.0 or higher**. However, **Podman version 2.2.1 does not work with
|
||||
Ceph Pacific**.
|
||||
|
||||
"Kubic stable" is known to work with Ceph Pacific, but it must be run
|
||||
with a newer kernel.
|
||||
|
||||
|
||||
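To check an installation against the compatibility table above, a quick sketch::

   podman --version
   ceph --version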
.. _cephadm-stability:
|
||||
@ -36,19 +49,18 @@ Those versions are expected to work:
|
||||
Stability
|
||||
---------
|
||||
|
||||
Cephadm is actively in development. Please be aware that some
|
||||
functionality is still rough around the edges. Especially the
|
||||
following components are working with cephadm, but the
|
||||
documentation is not as complete as we would like, and there may be some
|
||||
changes in the near future:
|
||||
Cephadm is relatively stable but new functionality is still being
|
||||
added and bugs are occasionally discovered. If issues are found, please
|
||||
open a tracker issue under the Orchestrator component (https://tracker.ceph.com/projects/orchestrator/issues)
|
||||
|
||||
- RGW
|
||||
Cephadm support remains under development for the following features:
|
||||
|
||||
Cephadm support for the following features is still under development and may see breaking
|
||||
changes in future releases:
|
||||
- ceph-exporter deployment
|
||||
- stretch mode integration
|
||||
- monitoring stack (moving towards prometheus service discover and providing TLS)
|
||||
- RGW multisite deployment support (requires lots of manual steps currently)
|
||||
- cephadm agent
|
||||
|
||||
- Ingress
|
||||
- Cephadm exporter daemon
|
||||
- cephfs-mirror
|
||||
|
||||
In case you encounter issues, see also :ref:`cephadm-pause`.
|
||||
If a cephadm command fails or a service stops running properly, see
|
||||
:ref:`cephadm-pause` for instructions on how to pause the Ceph cluster's
|
||||
background activity and how to disable cephadm.
|
||||
|
@ -4,17 +4,26 @@
|
||||
Host Management
|
||||
===============
|
||||
|
||||
To list hosts associated with the cluster:
|
||||
Listing Hosts
|
||||
=============
|
||||
|
||||
Run a command of this form to list hosts associated with the cluster:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch host ls [--format yaml] [--host-pattern <name>] [--label <label>] [--host-status <status>]
|
||||
|
||||
where the optional arguments "host-pattern", "label" and "host-status" are used for filtering.
|
||||
"host-pattern" is a regex that will match against hostnames and will only return matching hosts
|
||||
"label" will only return hosts with the given label
|
||||
"host-status" will only return hosts with the given status (currently "offline" or "maintenance")
|
||||
Any combination of these filtering flags is valid. You may filter against name, label and/or status simultaneously
|
||||
In commands of this form, the arguments "host-pattern", "label" and
|
||||
"host-status" are optional and are used for filtering.
|
||||
|
||||
- "host-pattern" is a regex that matches against hostnames and returns only
|
||||
matching hosts.
|
||||
- "label" returns only hosts with the specified label.
|
||||
- "host-status" returns only hosts with the specified status (currently
|
||||
"offline" or "maintenance").
|
||||
- Any combination of these filtering flags is valid. It is possible to filter
|
||||
against name, label and status simultaneously, or to filter against any
|
||||
proper subset of name, label and status.
|
||||
|
||||
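A few hedged examples of combining the filters described above (labels and patterns are placeholders)::

   ceph orch host ls --label osd
   ceph orch host ls --host-pattern 'node0[1-3]'
   ceph orch host ls --label mon --host-status maintenance --format yaml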
.. _cephadm-adding-hosts:
|
||||
|
||||
@ -70,31 +79,35 @@ To add each new host to the cluster, perform two steps:
|
||||
Removing Hosts
|
||||
==============
|
||||
|
||||
A host can safely be removed from a the cluster once all daemons are removed from it.
|
||||
A host can safely be removed from the cluster after all daemons are removed
|
||||
from it.
|
||||
|
||||
To drain all daemons from a host do the following:
|
||||
To drain all daemons from a host, run a command of the following form:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch host drain *<host>*
|
||||
|
||||
The '_no_schedule' label will be applied to the host. See :ref:`cephadm-special-host-labels`
|
||||
The ``_no_schedule`` label will be applied to the host. See
|
||||
:ref:`cephadm-special-host-labels`.
|
||||
|
||||
All osds on the host will be scheduled to be removed. You can check osd removal progress with the following:
|
||||
All OSDs on the host will be scheduled to be removed. You can check the progress of the OSD removal operation with the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch osd rm status
|
||||
|
||||
see :ref:`cephadm-osd-removal` for more details about osd removal
|
||||
See :ref:`cephadm-osd-removal` for more details about OSD removal.
|
||||
|
||||
You can check if there are no daemons left on the host with the following:
|
||||
Use the following command to determine whether any daemons are still on the
|
||||
host:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch ps <host>
|
||||
|
||||
Once all daemons are removed you can remove the host with the following:
|
||||
After all daemons have been removed from the host, remove the host from the
|
||||
cluster by running the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
@ -103,14 +116,16 @@ Once all daemons are removed you can remove the host with the following:
|
||||
Offline host removal
|
||||
--------------------
|
||||
|
||||
If a host is offline and can not be recovered it can still be removed from the cluster with the following:
|
||||
Even if a host is offline and can not be recovered, it can be removed from the
|
||||
cluster by running a command of the following form:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch host rm <host> --offline --force
|
||||
|
||||
This can potentially cause data loss as osds will be forcefully purged from the cluster by calling ``osd purge-actual`` for each osd.
|
||||
Service specs that still contain this host should be manually updated.
|
||||
.. warning:: This can potentially cause data loss. This command forcefully
|
||||
purges OSDs from the cluster by calling ``osd purge-actual`` for each OSD.
|
||||
Any service specs that still contain this host should be manually updated.
|
||||
|
||||
.. _orchestrator-host-labels:
|
||||
|
||||
@ -122,16 +137,22 @@ are free form and have no particular meaning by itself and each host
|
||||
can have multiple labels. They can be used to specify placement
|
||||
of daemons. See :ref:`orch-placement-by-labels`
|
||||
|
||||
Labels can be added when adding a host with the ``--labels`` flag::
|
||||
Labels can be added when adding a host with the ``--labels`` flag:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch host add my_hostname --labels=my_label1
|
||||
ceph orch host add my_hostname --labels=my_label1,my_label2
|
||||
|
||||
To add a label a existing host, run::
|
||||
To add a label to an existing host, run:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch host label add my_hostname my_label
|
||||
|
||||
To remove a label, run::
|
||||
To remove a label, run:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch host label rm my_hostname my_label
|
||||
|
||||
@ -168,7 +189,9 @@ The following host labels have a special meaning to cephadm. All start with ``_
|
||||
Maintenance Mode
|
||||
================
|
||||
|
||||
Place a host in and out of maintenance mode (stops all Ceph daemons on host)::
|
||||
Place a host in and out of maintenance mode (stops all Ceph daemons on host):
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch host maintenance enter <hostname> [--force]
|
||||
ceph orch host maintenance exit <hostname>
|
||||
@ -182,14 +205,21 @@ Rescanning Host Devices
|
||||
|
||||
Some servers and external enclosures may not register device removal or insertion with the
|
||||
kernel. In these scenarios, you'll need to perform a host rescan. A rescan is typically
|
||||
non-disruptive, and can be performed with the following CLI command.::
|
||||
non-disruptive, and can be performed with the following CLI command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch host rescan <hostname> [--with-summary]
|
||||
|
||||
The ``with-summary`` flag provides a breakdown of the number of HBAs found and scanned, together
|
||||
with any that failed.::
|
||||
with any that failed:
|
||||
|
||||
.. prompt:: bash [ceph:root@rh9-ceph1/]#
|
||||
|
||||
ceph orch host rescan rh9-ceph1 --with-summary
|
||||
|
||||
::
|
||||
|
||||
[ceph: root@rh9-ceph1 /]# ceph orch host rescan rh9-ceph1 --with-summary
|
||||
Ok. 2 adapters detected: 2 rescanned, 0 skipped, 0 failed (0.32s)
|
||||
|
||||
Creating many hosts at once
|
||||
@ -217,9 +247,10 @@ Many hosts can be added at once using
|
||||
hostname: node-02
|
||||
addr: 192.168.0.12
|
||||
|
||||
This can be combined with service specifications (below) to create a cluster spec
|
||||
file to deploy a whole cluster in one command. see ``cephadm bootstrap --apply-spec``
|
||||
also to do this during bootstrap. Cluster SSH Keys must be copied to hosts prior to adding them.
|
||||
This can be combined with :ref:`service specifications<orchestrator-cli-service-spec>`
|
||||
to create a cluster spec file to deploy a whole cluster in one command. See
``cephadm bootstrap --apply-spec`` for a way to do this during bootstrap. Cluster
SSH keys must be copied to hosts prior to adding them.
|
||||
|
||||
Setting the initial CRUSH location of host
|
||||
==========================================
|
||||
@ -246,8 +277,10 @@ See also :ref:`crush_map_default_types`.
|
||||
OS Tuning Profiles
|
||||
==================
|
||||
|
||||
Cephadm can manage operating system tuning profiles that apply a set of sysctl settings
|
||||
to a given set of hosts. First create a YAML spec file in the following format
|
||||
Cephadm can be used to manage operating-system-tuning profiles that apply sets
|
||||
of sysctl settings to sets of hosts.
|
||||
|
||||
Create a YAML spec file in the following format:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
@ -260,77 +293,90 @@ to a given set of hosts. First create a YAML spec file in the following format
|
||||
fs.file-max: 1000000
|
||||
vm.swappiness: '13'
|
||||
|
||||
Then apply the tuning profile with::
|
||||
Apply the tuning profile with the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch tuned-profile apply -i <tuned-profile-file-name>
|
||||
|
||||
This profile will then be written to ``/etc/sysctl.d/`` on each host matching the
|
||||
given placement and `sysctl --system` will be run on the host.
|
||||
This profile is written to ``/etc/sysctl.d/`` on each host that matches the
|
||||
hosts specified in the placement block of the yaml, and ``sysctl --system`` is
|
||||
run on the host.
|
||||
|
||||
.. note::
|
||||
|
||||
The exact filename the profile will be written to is within ``/etc/sysctl.d/`` is
|
||||
``<profile-name>-cephadm-tuned-profile.conf`` where <profile-name>
|
||||
is the `profile_name` setting specified in the provided YAML spec. Since sysctl
|
||||
settings are applied in lexicographical order by the filename the setting is
|
||||
specified in, you may want to set the `profile_name` in your spec so
|
||||
that it is applied before or after other conf files that may exist.
|
||||
The exact filename that the profile is written to within ``/etc/sysctl.d/``
|
||||
is ``<profile-name>-cephadm-tuned-profile.conf``, where ``<profile-name>`` is
|
||||
the ``profile_name`` setting that you specify in the YAML spec. Because
|
||||
sysctl settings are applied in lexicographical order (sorted by the filename
|
||||
in which the setting is specified), you may want to set the ``profile_name``
|
||||
in your spec so that it is applied before or after other conf files.
|
||||
|
||||
.. note::
|
||||
|
||||
These settings are applied only at the host level, and are not specific
|
||||
to any certain daemon or container
|
||||
to any particular daemon or container.
|
||||
|
||||
.. note::
|
||||
|
||||
Applying tuned profiles is idempotent when the ``--no-overwrite`` option is passed.
|
||||
In this case existing profiles with the same name are not overwritten.
|
||||
Applying tuned profiles is idempotent when the ``--no-overwrite`` option is
|
||||
passed. Moreover, if the ``--no-overwrite`` option is passed, existing
|
||||
profiles with the same name are not overwritten.
|
||||
|
||||
|
||||
Viewing Profiles
|
||||
----------------
|
||||
|
||||
To view all current profiles cephadm is managing::
|
||||
Run the following command to view all the profiles that cephadm currently manages:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch tuned-profile ls
|
||||
|
||||
.. note::
|
||||
|
||||
If you'd like to make modifications and re-apply a profile passing `--format yaml` to the
|
||||
``tuned-profile ls`` command will present the profiles in a format where they can be copied
|
||||
and re-applied.
|
||||
To make modifications and re-apply a profile, pass ``--format yaml`` to the
|
||||
``tuned-profile ls`` command. The ``tuned-profile ls --format yaml`` command
|
||||
presents the profiles in a format that is easy to copy and re-apply.
|
||||
|
||||
|
||||
Removing Profiles
|
||||
-----------------
|
||||
|
||||
If you no longer want one of the previously applied profiles, it can be removed with::
|
||||
To remove a previously applied profile, run this command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch tuned-profile rm <profile-name>
|
||||
|
||||
When a profile is removed, cephadm will clean up the file previously written to /etc/sysctl.d
|
||||
When a profile is removed, cephadm cleans up the file previously written to ``/etc/sysctl.d``.
|
||||
|
||||
|
||||
Modifying Profiles
|
||||
------------------
|
||||
|
||||
While you can modify a profile by simply re-applying a YAML spec with the same profile name,
|
||||
you may also want to adjust a setting within a given profile, so there are commands
|
||||
for this purpose.
|
||||
Profiles can be modified by re-applying a YAML spec with the same name as the
|
||||
profile that you want to modify, but settings within existing profiles can be
|
||||
adjusted with the following commands.
|
||||
|
||||
To add or modify a setting for an existing profile::
|
||||
To add or modify a setting in an existing profile:
|
||||
|
||||
ceph orch tuned-profile add-setting <setting-name> <value>
|
||||
.. prompt:: bash #
|
||||
|
||||
To remove a setting from an existing profile::
|
||||
ceph orch tuned-profile add-setting <profile-name> <setting-name> <value>
|
||||
|
||||
ceph orch tuned-profile rm-setting <setting-name>
|
||||
To remove a setting from an existing profile:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch tuned-profile rm-setting <profile-name> <setting-name>
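For example, assuming a profile named ``mon-host-profile`` (an illustrative name), a setting could be added and later removed like this:

.. prompt:: bash #

   ceph orch tuned-profile add-setting mon-host-profile vm.swappiness 10
   ceph orch tuned-profile rm-setting mon-host-profile vm.swappiness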
|
||||
|
||||
.. note::
|
||||
|
||||
Modifying the placement will require re-applying a profile with the same name. Keep
|
||||
in mind that profiles are tracked by their name, so whenever a profile with the same
|
||||
name as an existing profile is applied, it will overwrite the old profile.
|
||||
Modifying the placement requires re-applying a profile with the same name.
|
||||
Remember that profiles are tracked by their names, so when a profile with the
|
||||
same name as an existing profile is applied, it overwrites the old profile
|
||||
unless the ``--no-overwrite`` flag is passed.
|
||||
|
||||
SSH Configuration
|
||||
=================
|
||||
@ -347,24 +393,34 @@ connect to remote hosts. When the cluster is bootstrapped, this SSH
|
||||
key is generated automatically and no additional configuration
|
||||
is necessary.
|
||||
|
||||
A *new* SSH key can be generated with::
|
||||
A *new* SSH key can be generated with:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph cephadm generate-key
|
||||
|
||||
The public portion of the SSH key can be retrieved with::
|
||||
The public portion of the SSH key can be retrieved with:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph cephadm get-pub-key
|
||||
|
||||
The currently stored SSH key can be deleted with::
|
||||
The currently stored SSH key can be deleted with:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph cephadm clear-key
|
||||
|
||||
You can make use of an existing key by directly importing it with::
|
||||
You can make use of an existing key by directly importing it with:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config-key set mgr/cephadm/ssh_identity_key -i <key>
|
||||
ceph config-key set mgr/cephadm/ssh_identity_pub -i <pub>
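As a sketch, a dedicated key pair could first be generated with ``ssh-keygen`` and then imported; the file paths here are only placeholders:

.. prompt:: bash #

   ssh-keygen -t ed25519 -f /root/cephadm_ssh_key -N ""
   ceph config-key set mgr/cephadm/ssh_identity_key -i /root/cephadm_ssh_key
   ceph config-key set mgr/cephadm/ssh_identity_pub -i /root/cephadm_ssh_key.pub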
|
||||
|
||||
You will then need to restart the mgr daemon to reload the configuration with::
|
||||
You will then need to restart the mgr daemon to reload the configuration with:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph mgr fail
|
||||
|
||||
@ -378,11 +434,13 @@ that has enough privileges to download container images, start containers
|
||||
and execute commands without prompting for a password. If you do not want
|
||||
to use the "root" user (default option in cephadm), you must provide
|
||||
cephadm the name of the user that is going to be used to perform all the
|
||||
cephadm operations. Use the command::
|
||||
cephadm operations. Use the command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph cephadm set-user <user>
|
||||
|
||||
Prior to running this the cluster ssh key needs to be added to this users
|
||||
Prior to running this, the cluster SSH key needs to be added to this user's
|
||||
authorized_keys file and non-root users must have passwordless sudo access.
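One way to do this (a sketch; the user name, host name, and key path are placeholders) is to copy the cluster's public SSH key into that user's ``authorized_keys`` on each host:

.. prompt:: bash #

   ssh-copy-id -f -i /etc/ceph/ceph.pub <user>@<host>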
|
||||
|
||||
|
||||
@ -401,15 +459,21 @@ something like this::
|
||||
There are two ways to customize this configuration for your environment:
|
||||
|
||||
#. Import a customized configuration file that will be stored
|
||||
by the monitor with::
|
||||
by the monitor with:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph cephadm set-ssh-config -i <ssh_config_file>
|
||||
|
||||
To remove a customized SSH config and revert back to the default behavior::
|
||||
To remove a customized SSH config and revert back to the default behavior:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph cephadm clear-ssh-config
|
||||
|
||||
#. You can configure a file location for the SSH configuration file with::
|
||||
#. You can configure a file location for the SSH configuration file with:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config set mgr mgr/cephadm/ssh_config_file <path>
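With either approach, the file itself is ordinary OpenSSH client configuration. A minimal sketch of a customized file might look like this (all values are illustrative, and which options are honored depends on cephadm's SSH backend)::

   Host *
     User root
     Port 2222
     IdentityFile /etc/ceph/custom_ssh_key
     StrictHostKeyChecking no
     UserKnownHostsFile /dev/null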
|
||||
|
||||
@ -476,4 +540,4 @@ requires the bare host name when adding a host to the cluster:
|
||||
|
||||
..
|
||||
TODO: This chapter needs to provide way for users to configure
|
||||
Grafana in the dashboard, as this is right no very hard to do.
|
||||
Grafana in the dashboard, as this is right now very hard to do.
|
||||
|
@ -4,21 +4,36 @@
|
||||
Cephadm
|
||||
=======
|
||||
|
||||
``cephadm`` deploys and manages a Ceph cluster. It does this by connecting the
|
||||
manager daemon to hosts via SSH. The manager daemon is able to add, remove, and
|
||||
update Ceph containers. ``cephadm`` does not rely on external configuration
|
||||
tools such as Ansible, Rook, and Salt.
|
||||
``cephadm`` is a utility that is used to manage a Ceph cluster.
|
||||
|
||||
``cephadm`` manages the full lifecycle of a Ceph cluster. This lifecycle
|
||||
starts with the bootstrapping process, when ``cephadm`` creates a tiny
|
||||
Ceph cluster on a single node. This cluster consists of one monitor and
|
||||
one manager. ``cephadm`` then uses the orchestration interface ("day 2"
|
||||
commands) to expand the cluster, adding all hosts and provisioning all
|
||||
Ceph daemons and services. Management of this lifecycle can be performed
|
||||
either via the Ceph command-line interface (CLI) or via the dashboard (GUI).
|
||||
Here is a list of some of the things that ``cephadm`` can do:
|
||||
|
||||
``cephadm`` is new in Ceph release v15.2.0 (Octopus) and does not support older
|
||||
versions of Ceph.
|
||||
- ``cephadm`` can add a Ceph container to the cluster.
|
||||
- ``cephadm`` can remove a Ceph container from the cluster.
|
||||
- ``cephadm`` can update Ceph containers.
|
||||
|
||||
``cephadm`` does not rely on external configuration tools like Ansible, Rook,
|
||||
or Salt. However, those external configuration tools can be used to automate
|
||||
operations not performed by cephadm itself. To learn more about these external
|
||||
configuration tools, visit their pages:
|
||||
|
||||
* https://github.com/ceph/cephadm-ansible
|
||||
* https://rook.io/docs/rook/v1.10/Getting-Started/intro/
|
||||
* https://github.com/ceph/ceph-salt
|
||||
|
||||
``cephadm`` manages the full lifecycle of a Ceph cluster. This lifecycle starts
|
||||
with the bootstrapping process, when ``cephadm`` creates a tiny Ceph cluster on
|
||||
a single node. This cluster consists of one monitor and one manager.
|
||||
``cephadm`` then uses the orchestration interface to expand the cluster, adding
|
||||
hosts and provisioning Ceph daemons and services. Management of this lifecycle
|
||||
can be performed either via the Ceph command-line interface (CLI) or via the
|
||||
dashboard (GUI).
|
||||
|
||||
To use ``cephadm`` to get started with Ceph, follow the instructions in
|
||||
:ref:`cephadm_deploying_new_cluster`.
|
||||
|
||||
``cephadm`` was introduced in Ceph release v15.2.0 (Octopus) and does not
|
||||
support older versions of Ceph.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _cephadm_deploying_new_cluster:
|
||||
|
||||
============================
|
||||
Deploying a new Ceph cluster
|
||||
============================
|
||||
@ -8,7 +10,6 @@ then deploying the needed services.
|
||||
|
||||
.. highlight:: console
|
||||
|
||||
|
||||
.. _cephadm-host-requirements:
|
||||
|
||||
Requirements
|
||||
@ -35,17 +36,12 @@ Ceph.
|
||||
Install cephadm
|
||||
===============
|
||||
|
||||
The ``cephadm`` command can
|
||||
|
||||
#. bootstrap a new cluster
|
||||
#. launch a containerized shell with a working Ceph CLI
|
||||
#. aid in debugging containerized Ceph daemons
|
||||
|
||||
There are two ways to install ``cephadm``:
|
||||
|
||||
#. a :ref:`curl-based installation<cephadm_install_curl>` method
|
||||
#. :ref:`distribution-specific installation methods<cephadm_install_distros>`
|
||||
|
||||
|
||||
.. _cephadm_install_curl:
|
||||
|
||||
curl-based installation
|
||||
@ -216,8 +212,8 @@ available options.
|
||||
EOF
|
||||
$ ./cephadm bootstrap --config initial-ceph.conf ...
|
||||
|
||||
* The ``--ssh-user *<user>*`` option makes it possible to choose which ssh
|
||||
user cephadm will use to connect to hosts. The associated ssh key will be
|
||||
* The ``--ssh-user *<user>*`` option makes it possible to choose which SSH
|
||||
user cephadm will use to connect to hosts. The associated SSH key will be
|
||||
added to ``/home/*<user>*/.ssh/authorized_keys``. The user that you
|
||||
designate with this option must have passwordless sudo access.
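  As a brief sketch (the monitor IP and user name are placeholders, and the user is assumed to already have passwordless sudo):

  .. prompt:: bash #

     cephadm bootstrap --mon-ip <mon-ip> --ssh-user cephadm-user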
|
||||
|
||||
@ -298,7 +294,7 @@ By default, a ``ceph.conf`` file and a copy of the ``client.admin`` keyring
|
||||
are maintained in ``/etc/ceph`` on all hosts with the ``_admin`` label, which is initially
|
||||
applied only to the bootstrap host. We usually recommend that one or more other hosts be
|
||||
given the ``_admin`` label so that the Ceph CLI (e.g., via ``cephadm shell``) is easily
|
||||
accessible on multiple hosts. To add the ``_admin`` label to additional host(s),
|
||||
accessible on multiple hosts. To add the ``_admin`` label to additional host(s):
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
@ -316,8 +312,8 @@ Please follow :ref:`deploy_additional_monitors` to deploy additional MONs.
|
||||
Adding Storage
|
||||
==============
|
||||
|
||||
To add storage to the cluster, either tell Ceph to consume any
|
||||
available and unused device:
|
||||
To add storage to the cluster, you can tell Ceph to consume any
|
||||
available and unused device(s):
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
@ -368,7 +364,9 @@ Different deployment scenarios
|
||||
Single host
|
||||
-----------
|
||||
|
||||
To configure a Ceph cluster to run on a single host, use the ``--single-host-defaults`` flag when bootstrapping. For use cases of this, see :ref:`one-node-cluster`.
|
||||
To configure a Ceph cluster to run on a single host, use the
|
||||
``--single-host-defaults`` flag when bootstrapping. For use cases of this, see
|
||||
:ref:`one-node-cluster`.
|
||||
|
||||
The ``--single-host-defaults`` flag sets the following configuration options::
|
||||
|
||||
@ -376,30 +374,68 @@ The ``--single-host-defaults`` flag sets the following configuration options::
|
||||
global/osd_pool_default_size = 2
|
||||
mgr/mgr_standby_modules = False
|
||||
|
||||
For more information on these options, see :ref:`one-node-cluster` and ``mgr_standby_modules`` in :ref:`mgr-administrator-guide`.
|
||||
For more information on these options, see :ref:`one-node-cluster` and
|
||||
``mgr_standby_modules`` in :ref:`mgr-administrator-guide`.
|
||||
|
||||
.. _cephadm-airgap:
|
||||
|
||||
Deployment in an isolated environment
|
||||
-------------------------------------
|
||||
|
||||
You can install Cephadm in an isolated environment by using a custom container registry. You can either configure Podman or Docker to use an insecure registry, or make the registry secure. Ensure your container image is inside the registry and that you have access to all hosts you wish to add to the cluster.
|
||||
You might need to install cephadm in an environment that is not connected
|
||||
directly to the internet (such an environment is also called an "isolated
|
||||
environment"). This can be done if a custom container registry is used. Either
|
||||
of two kinds of custom container registry can be used in this scenario: (1) a
|
||||
Podman-based or Docker-based insecure registry, or (2) a secure registry.
|
||||
|
||||
Run a local container registry:
|
||||
The practice of installing software on systems that are not connected directly
|
||||
to the internet is called "airgapping" and registries that are not connected
|
||||
directly to the internet are referred to as "airgapped".
|
||||
|
||||
Make sure that your container image is inside the registry. Make sure that you
|
||||
have access to all hosts that you plan to add to the cluster.
|
||||
|
||||
#. Run a local container registry:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
podman run --privileged -d --name registry -p 5000:5000 -v /var/lib/registry:/var/lib/registry --restart=always registry:2
|
||||
|
||||
If you are using an insecure registry, configure Podman or Docker with the hostname and port where the registry is running.
|
||||
#. If you are using an insecure registry, configure Podman or Docker with the
|
||||
hostname and port where the registry is running.
|
||||
|
||||
.. note:: For every host which accesses the local insecure registry, you will need to repeat this step on the host.
|
||||
.. note:: You must repeat this step for every host that accesses the local
|
||||
insecure registry.
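As a sketch, for Podman this typically means adding an entry to ``/etc/containers/registries.conf`` on each such host (the hostname and port below are placeholders); Docker users would instead list the registry under ``insecure-registries`` in ``/etc/docker/daemon.json``:

.. code-block:: toml

   [[registry]]
   location = "<hostname>:5000"
   insecure = true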
|
||||
|
||||
Next, push your container image to your local registry.
|
||||
#. Push your container image to your local registry. Here are some acceptable
|
||||
kinds of container images:
|
||||
|
||||
Then run bootstrap using the ``--image`` flag with your container image. For example:
|
||||
* Ceph container image. See :ref:`containers`.
|
||||
* Prometheus container image
|
||||
* Node exporter container image
|
||||
* Grafana container image
|
||||
* Alertmanager container image
|
||||
|
||||
#. Create a temporary configuration file to store the names of the monitoring
|
||||
images. (See :ref:`cephadm_monitoring-images`):
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
cat <<EOF > initial-ceph.conf
|
||||
|
||||
::
|
||||
|
||||
[mgr]
|
||||
mgr/cephadm/container_image_prometheus *<hostname>*:5000/prometheus
|
||||
mgr/cephadm/container_image_node_exporter *<hostname>*:5000/node_exporter
|
||||
mgr/cephadm/container_image_grafana *<hostname>*:5000/grafana
|
||||
mgr/cephadm/container_image_alertmanager *<hostname>*:5000/alertmanager
|
||||
|
||||
#. Run bootstrap using the ``--image`` flag and pass the name of your
|
||||
container image as the argument of the image flag. For example:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
cephadm --image *<hostname>*:5000/ceph/ceph bootstrap --mon-ip *<mon-ip>*
|
||||
|
||||
|
||||
.. _cluster network: ../rados/configuration/network-config-ref#cluster-network
|
||||
|
@ -40,6 +40,68 @@ monitor hosts as well as to the monitor daemons' stderr.
|
||||
|
||||
.. _cephadm-logs:
|
||||
|
||||
|
||||
Ceph daemon control
|
||||
===================
|
||||
|
||||
Starting and stopping daemons
|
||||
-----------------------------
|
||||
|
||||
You can stop, start, or restart a daemon with:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch daemon stop <name>
|
||||
ceph orch daemon start <name>
|
||||
ceph orch daemon restart <name>
|
||||
|
||||
You can also do the same for all daemons for a service with:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch stop <name>
|
||||
ceph orch start <name>
|
||||
ceph orch restart <name>
|
||||
|
||||
|
||||
Redeploying or reconfiguring a daemon
|
||||
-------------------------------------
|
||||
|
||||
The container for a daemon can be stopped, recreated, and restarted with
|
||||
the ``redeploy`` command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch daemon redeploy <name> [--image <image>]
|
||||
|
||||
A container image name can optionally be provided to force a
|
||||
particular image to be used (instead of the image specified by the
|
||||
``container_image`` config value).
|
||||
|
||||
If only the ceph configuration needs to be regenerated, you can also
|
||||
issue a ``reconfig`` command, which will rewrite the ``ceph.conf``
|
||||
file but will not trigger a restart of the daemon.
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch daemon reconfig <name>
|
||||
|
||||
|
||||
Rotating a daemon's authenticate key
|
||||
------------------------------------
|
||||
|
||||
All Ceph and gateway daemons in the cluster have a secret key that is used to connect
|
||||
to and authenticate with the cluster. This key can be rotated (i.e., replaced with a
|
||||
new key) with the following command:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph orch daemon rotate-key <name>
|
||||
|
||||
For MDS, OSD, and MGR daemons, this does not require a daemon restart. For other
|
||||
daemons, however (e.g., RGW), the daemon may be restarted to switch to the new key.
|
||||
|
||||
|
||||
Ceph daemon logs
|
||||
================
|
||||
|
||||
|
@ -496,11 +496,20 @@ candidate hosts.
|
||||
If there are fewer hosts selected by the placement specification than
|
||||
demanded by ``count``, cephadm will deploy only on the selected hosts.
|
||||
|
||||
.. _cephadm-extra-container-args:
|
||||
|
||||
Extra Container Arguments
|
||||
=========================
|
||||
|
||||
.. warning::
|
||||
The arguments provided for extra container args are limited to whatever arguments are available for a `run` command from whichever container engine you are using. Providing any arguments the `run` command does not support (or invalid values for arguments) will cause the daemon to fail to start.
|
||||
The arguments provided for extra container args are limited to whatever arguments are available for
|
||||
a `run` command from whichever container engine you are using. Providing any arguments the `run`
|
||||
command does not support (or invalid values for arguments) will cause the daemon to fail to start.
|
||||
|
||||
.. note::
|
||||
|
||||
For arguments passed to the process running inside the container rather than the for
|
||||
the container runtime itself, see :ref:`cephadm-extra-entrypoint-args`
|
||||
|
||||
|
||||
Cephadm supports providing extra miscellaneous container arguments for
|
||||
@ -522,6 +531,52 @@ a spec like
|
||||
|
||||
which would cause each mon daemon to be deployed with `--cpus=2`.
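Such a spec takes roughly the following shape (a sketch; the placement is illustrative):

.. code-block:: yaml

    service_type: mon
    service_name: mon
    placement:
      host_pattern: '*'
    extra_container_args:
      - "--cpus=2"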
|
||||
|
||||
Mounting Files with Extra Container Arguments
|
||||
---------------------------------------------
|
||||
|
||||
A common use case for extra container arguments is to mount additional
|
||||
files within the container. However, some intuitive formats for doing
|
||||
so can cause deployment to fail (see https://tracker.ceph.com/issues/57338).
|
||||
The recommended syntax for mounting a file with extra container arguments is:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
extra_container_args:
|
||||
- "-v"
|
||||
- "/absolute/file/path/on/host:/absolute/file/path/in/container"
|
||||
|
||||
For example:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
extra_container_args:
|
||||
- "-v"
|
||||
- "/opt/ceph_cert/host.cert:/etc/grafana/certs/cert_file:ro"
|
||||
|
||||
.. _cephadm-extra-entrypoint-args:
|
||||
|
||||
Extra Entrypoint Arguments
|
||||
==========================
|
||||
|
||||
.. note::
|
||||
|
||||
For arguments intended for the container runtime rather than the process inside
|
||||
it, see :ref:`cephadm-extra-container-args`
|
||||
|
||||
Similar to extra container args for the container runtime, Cephadm supports
|
||||
appending to args passed to the entrypoint process running
|
||||
within a container. For example, to set the collector textfile directory for
|
||||
the node-exporter service , one could apply a service spec like
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
service_type: node-exporter
|
||||
service_name: node-exporter
|
||||
placement:
|
||||
host_pattern: '*'
|
||||
extra_entrypoint_args:
|
||||
- "--collector.textfile.directory=/var/lib/node_exporter/textfile_collector2"
|
||||
|
||||
Custom Config Files
|
||||
===================
|
||||
|
||||
|
@ -125,6 +125,8 @@ example spec file:
|
||||
spec:
|
||||
port: 4200
|
||||
|
||||
.. _cephadm_monitoring-images:
|
||||
|
||||
Using custom images
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -183,6 +185,8 @@ For example, if you had changed the prometheus image
|
||||
|
||||
ceph config rm mgr mgr/cephadm/container_image_prometheus
|
||||
|
||||
See also :ref:`cephadm-airgap`.
|
||||
|
||||
.. _cephadm-overwrite-jinja2-templates:
|
||||
|
||||
Using custom configuration files
|
||||
@ -314,6 +318,36 @@ Due to performance reasons, monitoring of RBD images is disabled by default. For
|
||||
:ref:`prometheus-rbd-io-statistics`. If disabled, the overview and details dashboards will stay empty in Grafana
|
||||
and the metrics will not be visible in Prometheus.
|
||||
|
||||
Setting up Prometheus
|
||||
-----------------------
|
||||
|
||||
Setting Prometheus Retention Size and Time
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Cephadm can configure Prometheus TSDB retention by specifying ``retention_time``
|
||||
and ``retention_size`` values in the Prometheus service spec.
|
||||
The retention time value defaults to 15 days (15d). Users can set a different value/unit where
|
||||
supported units are: 'y', 'w', 'd', 'h', 'm' and 's'. The retention size value defaults
|
||||
to 0 (disabled). Supported units in this case are: 'B', 'KB', 'MB', 'GB', 'TB', 'PB' and 'EB'.
|
||||
|
||||
In the following example spec we set the retention time to 1 year and the size to 1GB.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
service_type: prometheus
|
||||
placement:
|
||||
count: 1
|
||||
spec:
|
||||
retention_time: "1y"
|
||||
retention_size: "1GB"
|
||||
|
||||
.. note::
|
||||
|
||||
If you already had Prometheus daemon(s) deployed before and are updating an
|
||||
existent spec as opposed to doing a fresh Prometheus deployment, you must also
|
||||
tell cephadm to redeploy the Prometheus daemon(s) to put this change into effect.
|
||||
This can be done with a ``ceph orch redeploy prometheus`` command.
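For example (the file name is illustrative):

.. prompt:: bash #

   ceph orch apply -i prometheus.yaml
   ceph orch redeploy prometheus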
|
||||
|
||||
Setting up Grafana
|
||||
------------------
|
||||
|
||||
|
@ -301,6 +301,7 @@ Expected output::
|
||||
|
||||
This resets the initial state of the OSD and takes it off the removal queue.
|
||||
|
||||
.. _cephadm-replacing-an-osd:
|
||||
|
||||
Replacing an OSD
|
||||
----------------
|
||||
@ -913,6 +914,57 @@ It is also possible to specify directly device paths in specific hosts like the
|
||||
|
||||
This can easily be done with other filters, like `size` or `vendor` as well.
|
||||
|
||||
It's possible to specify the `crush_device_class` parameter within the
|
||||
DriveGroup spec, and it's applied to all the devices defined by the `paths`
|
||||
keyword:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
service_type: osd
|
||||
service_id: osd_using_paths
|
||||
placement:
|
||||
hosts:
|
||||
- Node01
|
||||
- Node02
|
||||
crush_device_class: ssd
|
||||
spec:
|
||||
data_devices:
|
||||
paths:
|
||||
- /dev/sdb
|
||||
- /dev/sdc
|
||||
db_devices:
|
||||
paths:
|
||||
- /dev/sdd
|
||||
wal_devices:
|
||||
paths:
|
||||
- /dev/sde
|
||||
|
||||
The `crush_device_class` parameter, however, can be defined for each OSD passed
|
||||
using the `paths` keyword with the following syntax:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
service_type: osd
|
||||
service_id: osd_using_paths
|
||||
placement:
|
||||
hosts:
|
||||
- Node01
|
||||
- Node02
|
||||
crush_device_class: ssd
|
||||
spec:
|
||||
data_devices:
|
||||
paths:
|
||||
- path: /dev/sdb
|
||||
crush_device_class: ssd
|
||||
- path: /dev/sdc
|
||||
crush_device_class: nvme
|
||||
db_devices:
|
||||
paths:
|
||||
- /dev/sdd
|
||||
wal_devices:
|
||||
paths:
|
||||
- /dev/sde
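Either variant is applied like any other OSD service spec (the file name is illustrative):

.. prompt:: bash #

   ceph orch apply -i osd_spec.yaml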
|
||||
|
||||
.. _cephadm-osd-activate:
|
||||
|
||||
Activate existing OSDs
|
||||
|
@ -164,8 +164,10 @@ for RGW with a minimum set of configuration options. The orchestrator will
|
||||
deploy and manage a combination of haproxy and keepalived to provide load
|
||||
balancing on a floating virtual IP.
|
||||
|
||||
If SSL is used, then SSL must be configured and terminated by the ingress service
|
||||
and not RGW itself.
|
||||
If the RGW service is configured with SSL enabled, then the ingress service
|
||||
will use the `ssl` and `verify none` options in the backend configuration.
|
||||
Trust verification is disabled because the backends are accessed by IP
|
||||
address instead of FQDN.
|
||||
|
||||
.. image:: ../../images/HAProxy_for_RGW.svg
|
||||
|
||||
@ -186,8 +188,7 @@ between all the RGW daemons available.
|
||||
Prerequisites
|
||||
-------------
|
||||
|
||||
* An existing RGW service, without SSL. (If you want SSL service, the certificate
|
||||
should be configured on the ingress service, not the RGW service.)
|
||||
* An existing RGW service.
|
||||
|
||||
Deploying
|
||||
---------
|
||||
|
@ -1,22 +1,19 @@
|
||||
Troubleshooting
|
||||
===============
|
||||
|
||||
You might need to investigate why a cephadm command failed
|
||||
You may wish to investigate why a cephadm command failed
|
||||
or why a certain service no longer runs properly.
|
||||
|
||||
Cephadm deploys daemons as containers. This means that
|
||||
troubleshooting those containerized daemons might work
|
||||
differently than you expect (and that is certainly true if
|
||||
you expect this troubleshooting to work the way that
|
||||
troubleshooting does when the daemons involved aren't
|
||||
containerized).
|
||||
Cephadm deploys daemons within containers. This means that
|
||||
troubleshooting those containerized daemons will require
|
||||
a different process than the one used for daemons installed from traditional packages.
|
||||
|
||||
Here are some tools and commands to help you troubleshoot
|
||||
your Ceph environment.
|
||||
|
||||
.. _cephadm-pause:
|
||||
|
||||
Pausing or disabling cephadm
|
||||
Pausing or Disabling cephadm
|
||||
----------------------------
|
||||
|
||||
If something goes wrong and cephadm is behaving badly, you can
|
||||
@ -45,16 +42,15 @@ See :ref:`cephadm-spec-unmanaged` for information on disabling
|
||||
individual services.
|
||||
|
||||
|
||||
Per-service and per-daemon events
|
||||
Per-service and Per-daemon Events
|
||||
---------------------------------
|
||||
|
||||
In order to help with the process of debugging failed daemon
|
||||
deployments, cephadm stores events per service and per daemon.
|
||||
In order to facilitate debugging failed daemons,
|
||||
cephadm stores events per service and per daemon.
|
||||
These events often contain information relevant to
|
||||
troubleshooting
|
||||
your Ceph cluster.
|
||||
troubleshooting your Ceph cluster.
|
||||
|
||||
Listing service events
|
||||
Listing Service Events
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To see the events associated with a certain service, run a
|
||||
@ -82,7 +78,7 @@ This will return something in the following form:
|
||||
- '2021-02-01T12:09:25.264584 service:alertmanager [ERROR] "Failed to apply: Cannot
|
||||
place <AlertManagerSpec for service_name=alertmanager> on unknown_host: Unknown hosts"'
|
||||
|
||||
Listing daemon events
|
||||
Listing Daemon Events
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
To see the events associated with a certain daemon, run a
|
||||
@ -106,16 +102,16 @@ This will return something in the following form:
|
||||
mds.cephfs.hostname.ppdhsz on host 'hostname'"
|
||||
|
||||
|
||||
Checking cephadm logs
|
||||
Checking Cephadm Logs
|
||||
---------------------
|
||||
|
||||
To learn how to monitor the cephadm logs as they are generated, read :ref:`watching_cephadm_logs`.
|
||||
To learn how to monitor cephadm logs as they are generated, read :ref:`watching_cephadm_logs`.
|
||||
|
||||
If your Ceph cluster has been configured to log events to files, there will exist a
|
||||
cephadm log file called ``ceph.cephadm.log`` on all monitor hosts (see
|
||||
:ref:`cephadm-logs` for a more complete explanation of this).
|
||||
If your Ceph cluster has been configured to log events to files, there will be a
|
||||
``ceph.cephadm.log`` file on all monitor hosts (see
|
||||
:ref:`cephadm-logs` for a more complete explanation).
|
||||
|
||||
Gathering log files
|
||||
Gathering Log Files
|
||||
-------------------
|
||||
|
||||
Use journalctl to gather the log files of all daemons:
|
||||
@ -140,7 +136,7 @@ To fetch all log files of all daemons on a given host, run::
|
||||
cephadm logs --fsid <fsid> --name "$name" > $name;
|
||||
done
|
||||
|
||||
Collecting systemd status
|
||||
Collecting Systemd Status
|
||||
-------------------------
|
||||
|
||||
To print the state of a systemd unit, run::
|
||||
@ -156,7 +152,7 @@ To fetch all state of all daemons of a given host, run::
|
||||
done
|
||||
|
||||
|
||||
List all downloaded container images
|
||||
List all Downloaded Container Images
|
||||
------------------------------------
|
||||
|
||||
To list all container images that are downloaded on a host:
|
||||
@ -170,16 +166,16 @@ To list all container images that are downloaded on a host:
|
||||
"registry.opensuse.org/opensuse/leap:15.2"
|
||||
|
||||
|
||||
Manually running containers
|
||||
Manually Running Containers
|
||||
---------------------------
|
||||
|
||||
Cephadm writes small wrappers that run a containers. Refer to
|
||||
Cephadm uses small wrappers when running containers. Refer to
|
||||
``/var/lib/ceph/<cluster-fsid>/<service-name>/unit.run`` for the
|
||||
container execution command.
|
||||
|
||||
.. _cephadm-ssh-errors:
|
||||
|
||||
ssh errors
|
||||
SSH Errors
|
||||
----------
|
||||
|
||||
Error message::
|
||||
@ -191,7 +187,7 @@ Error message::
|
||||
Please make sure that the host is reachable and accepts connections using the cephadm SSH key
|
||||
...
|
||||
|
||||
Things users can do:
|
||||
Things Ceph administrators can do:
|
||||
|
||||
1. Ensure cephadm has an SSH identity key::
|
||||
|
||||
@ -208,7 +204,7 @@ Things users can do:
|
||||
|
||||
[root@mon1 ~]# cat ~/cephadm_private_key | cephadm shell -- ceph cephadm set-ssk-key -i -
|
||||
|
||||
2. Ensure that the ssh config is correct::
|
||||
2. Ensure that the SSH config is correct::
|
||||
|
||||
[root@mon1 ~]# cephadm shell -- ceph cephadm get-ssh-config > config
|
||||
|
||||
@ -224,7 +220,7 @@ To verify that the public key is in the authorized_keys file, run the following
|
||||
[root@mon1 ~]# cephadm shell -- ceph cephadm get-pub-key > ~/ceph.pub
|
||||
[root@mon1 ~]# grep "`cat ~/ceph.pub`" /root/.ssh/authorized_keys
|
||||
|
||||
Failed to infer CIDR network error
|
||||
Failed to Infer CIDR network error
|
||||
----------------------------------
|
||||
|
||||
If you see this error::
|
||||
@ -241,7 +237,7 @@ This means that you must run a command of this form::
|
||||
|
||||
For more detail on operations of this kind, see :ref:`deploy_additional_monitors`
|
||||
|
||||
Accessing the admin socket
|
||||
Accessing the Admin Socket
|
||||
--------------------------
|
||||
|
||||
Each Ceph daemon provides an admin socket that bypasses the
|
||||
@ -252,12 +248,12 @@ To access the admin socket, first enter the daemon container on the host::
|
||||
[root@mon1 ~]# cephadm enter --name <daemon-name>
|
||||
[ceph: root@mon1 /]# ceph --admin-daemon /var/run/ceph/ceph-<daemon-name>.asok config show
|
||||
|
||||
Calling miscellaneous ceph tools
|
||||
Running Various Ceph Tools
|
||||
--------------------------------
|
||||
|
||||
To call miscellaneous like ``ceph-objectstore-tool`` or
|
||||
``ceph-monstore-tool``, you can run them by calling
|
||||
``cephadm shell --name <daemon-name>`` like so::
|
||||
To run Ceph tools like ``ceph-objectstore-tool`` or
|
||||
``ceph-monstore-tool``, invoke the cephadm CLI with
|
||||
``cephadm shell --name <daemon-name>``. For example::
|
||||
|
||||
root@myhostname # cephadm unit --name mon.myhostname stop
|
||||
root@myhostname # cephadm shell --name mon.myhostname
|
||||
@ -272,21 +268,21 @@ To call miscellaneous like ``ceph-objectstore-tool`` or
|
||||
election_strategy: 1
|
||||
0: [v2:127.0.0.1:3300/0,v1:127.0.0.1:6789/0] mon.myhostname
|
||||
|
||||
This command sets up the environment in a way that is suitable
|
||||
for extended daemon maintenance and running the daemon interactively.
|
||||
The cephadm shell sets up the environment in a way that is suitable
|
||||
for extended daemon maintenance and running daemons interactively.
|
||||
|
||||
.. _cephadm-restore-quorum:
|
||||
|
||||
Restoring the MON quorum
|
||||
------------------------
|
||||
Restoring the Monitor Quorum
|
||||
----------------------------
|
||||
|
||||
In case the Ceph MONs cannot form a quorum, cephadm is not able
|
||||
to manage the cluster, until the quorum is restored.
|
||||
If the Ceph monitor daemons (mons) cannot form a quorum, cephadm will not be
|
||||
able to manage the cluster until quorum is restored.
|
||||
|
||||
In order to restore the MON quorum, remove unhealthy MONs
|
||||
In order to restore the quorum, remove unhealthy monitors
|
||||
from the monmap by following these steps:
|
||||
|
||||
1. Stop all MONs. For each MON host::
|
||||
1. Stop all mons. For each mon host::
|
||||
|
||||
ssh {mon-host}
|
||||
cephadm unit --name mon.`hostname` stop
|
||||
@ -301,18 +297,19 @@ form the monmap by following these steps:
|
||||
|
||||
.. _cephadm-manually-deploy-mgr:
|
||||
|
||||
Manually deploying a MGR daemon
|
||||
-------------------------------
|
||||
cephadm requires a MGR daemon in order to manage the cluster. In case the cluster
|
||||
the last MGR of a cluster was removed, follow these steps in order to deploy
|
||||
a MGR ``mgr.hostname.smfvfd`` on a random host of your cluster manually.
|
||||
Manually Deploying a Manager Daemon
|
||||
-----------------------------------
|
||||
At least one manager (mgr) daemon is required by cephadm in order to manage the
|
||||
cluster. If the last mgr in a cluster has been removed, follow these steps in
|
||||
order to deploy a manager called (for example)
|
||||
``mgr.hostname.smfvfd`` on a random host of your cluster manually.
|
||||
|
||||
Disable the cephadm scheduler, in order to prevent cephadm from removing the new
|
||||
MGR. See :ref:`cephadm-enable-cli`::
|
||||
manager. See :ref:`cephadm-enable-cli`::
|
||||
|
||||
ceph config-key set mgr/cephadm/pause true
|
||||
|
||||
Then get or create the auth entry for the new MGR::
|
||||
Then get or create the auth entry for the new manager::
|
||||
|
||||
ceph auth get-or-create mgr.hostname.smfvfd mon "profile mgr" osd "allow *" mds "allow *"
|
||||
|
||||
@ -338,26 +335,26 @@ Deploy the daemon::
|
||||
|
||||
cephadm --image <container-image> deploy --fsid <fsid> --name mgr.hostname.smfvfd --config-json config-json.json
|
||||
|
||||
Analyzing core dumps
|
||||
Analyzing Core Dumps
|
||||
---------------------
|
||||
|
||||
In case a Ceph daemon crashes, cephadm supports analyzing core dumps. To enable core dumps, run
|
||||
When a Ceph daemon crashes, cephadm supports analyzing core dumps. To enable core dumps, run
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ulimit -c unlimited
|
||||
|
||||
core dumps will now be written to ``/var/lib/systemd/coredump``.
|
||||
Core dumps will now be written to ``/var/lib/systemd/coredump``.
|
||||
|
||||
.. note::
|
||||
|
||||
core dumps are not namespaced by the kernel, which means
|
||||
Core dumps are not namespaced by the kernel, which means
|
||||
they will be written to ``/var/lib/systemd/coredump`` on
|
||||
the container host.
|
||||
|
||||
Now, wait for the crash to happen again. (To simulate the crash of a daemon, run e.g. ``killall -3 ceph-mon``)
|
||||
Now, wait for the crash to happen again. To simulate the crash of a daemon, run e.g. ``killall -3 ceph-mon``.
|
||||
|
||||
Install debug packages by entering the cephadm shell and install ``ceph-debuginfo``::
|
||||
Install debug packages including ``ceph-debuginfo`` by entering the cephadm shell::
|
||||
|
||||
# cephadm shell --mount /var/lib/systemd/coredump
|
||||
[ceph: root@host1 /]# dnf install ceph-debuginfo gdb zstd
|
||||
|
@ -2,29 +2,6 @@
|
||||
Upgrading Ceph
|
||||
==============
|
||||
|
||||
.. DANGER:: DATE: 01 NOV 2021.
|
||||
|
||||
DO NOT UPGRADE TO CEPH PACIFIC FROM AN OLDER VERSION.
|
||||
|
||||
A recently-discovered bug (https://tracker.ceph.com/issues/53062) can cause
|
||||
data corruption. This bug occurs during OMAP format conversion for
|
||||
clusters that are updated to Pacific. New clusters are not affected by this
|
||||
bug.
|
||||
|
||||
The trigger for this bug is BlueStore's repair/quick-fix functionality. This
|
||||
bug can be triggered in two known ways:
|
||||
|
||||
(1) manually via the ceph-bluestore-tool, or
|
||||
(2) automatically, by OSD if ``bluestore_fsck_quick_fix_on_mount`` is set
|
||||
to true.
|
||||
|
||||
The fix for this bug is expected to be available in Ceph v16.2.7.
|
||||
|
||||
DO NOT set ``bluestore_quick_fix_on_mount`` to true. If it is currently
|
||||
set to true in your configuration, immediately set it to false.
|
||||
|
||||
DO NOT run ``ceph-bluestore-tool``'s repair/quick-fix commands.
|
||||
|
||||
Cephadm can safely upgrade Ceph from one bugfix release to the next. For
|
||||
example, you can upgrade from v15.2.0 (the first Octopus release) to the next
|
||||
point release, v15.2.1.
|
||||
@ -199,7 +176,7 @@ Staggered Upgrade
|
||||
=================
|
||||
|
||||
Some users may prefer to upgrade components in phases rather than all at once.
|
||||
The upgrade command, starting in 16.2.10 and 17.2.1 allows parameters
|
||||
The upgrade command, starting in 16.2.11 and 17.2.1 allows parameters
|
||||
to limit which daemons are upgraded by a single upgrade command. The options
|
||||
include ``daemon_types``, ``services``, ``hosts`` and ``limit``. ``daemon_types``
|
||||
takes a comma-separated list of daemon types and will only upgrade daemons of those
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _cephfs_add_remote_mds:
|
||||
|
||||
.. note::
|
||||
It is highly recommended to use :doc:`/cephadm/index` or another Ceph
|
||||
orchestrator for setting up the ceph cluster. Use this approach only if you
|
||||
|
Binary file not shown.
Before size: 14 KiB | After size: 40 KiB
@ -9,7 +9,7 @@ in realtime. `cephfs-top` is a curses based python script which makes use of `st
|
||||
plugin in Ceph Manager to fetch (and display) metrics.
|
||||
|
||||
Manager Plugin
|
||||
--------------
|
||||
==============
|
||||
|
||||
Ceph Filesystem clients periodically forward various metrics to Ceph Metadata Servers (MDS)
|
||||
which in turn get forwarded to Ceph Manager by MDS rank zero. Each active MDS forwards its
|
||||
@ -29,9 +29,8 @@ metrics are for a particular MDS rank (e.g., number of subtrees handled by an MD
|
||||
Once enabled, Ceph Filesystem metrics can be fetched via::
|
||||
|
||||
$ ceph fs perf stats
|
||||
{"version": 1, "global_counters": ["cap_hit", "read_latency", "write_latency", "metadata_latency", "dentry_lease", "opened_files", "pinned_icaps", "opened_inodes", "avg_read_latency", "stdev_read_latency", "avg_write_latency", "stdev_write_latency", "avg_metadata_latency", "stdev_metadata_latency"], "counters": [], "client_metadata": {"client.324130": {"IP": "192.168.1.100", "hostname": "ceph-host1", "root": "/", "mount_point": "/mnt/cephfs", "valid_metrics": ["cap_hit", "read_latency", "write_latency", "metadata_latency", "dentry_lease, "opened_files", "pinned_icaps", "opened_inodes", "avg_read_latency", "stdev_read_latency", "avg_write_latency", "stdev_write_latency", "avg_metadata_latency", "stdev_metadata_latency"]}}, "global_metrics": {"client.324130": [[309785, 1280], [0, 0], [197, 519015022], [88, 279074768], [12, 70147], [0, 3], [3, 3], [0, 3], [0, 0], [0, 0], [0, 11699223], [0, 88245], [0, 6596951], [0, 9539]]}, "metrics": {"delayed_ranks": [], "mds.0": {"client.324130": []}}}
|
||||
|
||||
Details of the JSON command output are as follows:
|
||||
The output format is JSON and contains fields as follows:
|
||||
|
||||
- `version`: Version of stats output
|
||||
- `global_counters`: List of global performance metrics
|
||||
@ -54,7 +53,7 @@ To fetch metrics only for a subset of active MDSs (e.g., MDS rank 1 and 2)::
|
||||
$ ceph fs perf stats --mds_rank=1,2
|
||||
|
||||
`cephfs-top`
|
||||
------------
|
||||
============
|
||||
|
||||
The `cephfs-top` utility relies on the `stats` plugin to fetch performance metrics and displays them in
|
||||
a `top(1)`-like format. `cephfs-top` is available as part of the `cephfs-top` package.
|
||||
@ -64,6 +63,9 @@ By default, `cephfs-top` uses `client.fstop` user to connect to a Ceph cluster::
|
||||
$ ceph auth get-or-create client.fstop mon 'allow r' mds 'allow r' osd 'allow r' mgr 'allow r'
|
||||
$ cephfs-top
|
||||
|
||||
Command-Line Options
|
||||
--------------------
|
||||
|
||||
To use a non-default user (other than `client.fstop`) use::
|
||||
|
||||
$ cephfs-top --id <name>
|
||||
@ -78,8 +80,27 @@ By default, `cephfs-top` connects to cluster name `ceph`. To use a non-default c
|
||||
|
||||
Interval should be greater than or equal to 0.5 seconds. Fractional seconds are honoured.
|
||||
|
||||
Sample screenshot running `cephfs-top` with 2 clients:
|
||||
Interactive Commands
|
||||
--------------------
|
||||
|
||||
1. m : Filesystem selection
|
||||
Displays a menu of filesystems for selection.
|
||||
|
||||
2. s : Sort field selection
|
||||
Designates the sort field. 'cap_hit' is the default.
|
||||
|
||||
3. l : Client limit
|
||||
Sets the limit on the number of clients to be displayed.
|
||||
|
||||
4. r : Reset
|
||||
Resets the sort field and limit value to the default.
|
||||
|
||||
5. q : Quit
|
||||
Exit the utility if you are at the home screen (all filesystem info),
|
||||
otherwise escape back to the home screen.
|
||||
|
||||
The metrics display can be scrolled using the Arrow Keys, PgUp/PgDn, Home/End and mouse.
|
||||
|
||||
Sample screenshot running `cephfs-top` with 2 filesystems:
|
||||
|
||||
.. image:: cephfs-top.png
|
||||
|
||||
.. note:: As of now, `cephfs-top` does not reliably work with multiple Ceph Filesystems.
|
||||
|
@ -300,9 +300,8 @@ Ensure you have an MDS running and issue:
|
||||
|
||||
.. note::
|
||||
|
||||
Symbolic links are recovered as empty regular files. `Symbolic link recovery
|
||||
<https://tracker.ceph.com/issues/46166>`_ is scheduled to be supported in
|
||||
Pacific.
|
||||
`Symbolic link recovery <https://tracker.ceph.com/issues/46166>`_ is supported starting with Quincy.
|
||||
In earlier releases, symbolic links were recovered as empty regular files.
|
||||
|
||||
It is recommended to migrate any data from the recovery file system as soon as
|
||||
possible. Do not restore the old file system while the recovery file system is
|
||||
|
@ -99,6 +99,22 @@ expected to be disabled on the volume.
|
||||
|
||||
Fetch the information of a CephFS volume using::
|
||||
|
||||
$ ceph fs volume info vol_name [--human_readable]
|
||||
|
||||
The ``--human_readable`` flag shows used and available pool capacities in KB/MB/GB.
|
||||
|
||||
The output format is JSON and contains fields as follows:
|
||||
|
||||
* pools: Attributes of data and metadata pools
|
||||
* avail: The amount of free space available in bytes
|
||||
* used: The amount of storage consumed in bytes
|
||||
* name: Name of the pool
|
||||
* mon_addrs: List of monitor addresses
|
||||
* used_size: Current used size of the CephFS volume in bytes
|
||||
* pending_subvolume_deletions: Number of subvolumes pending deletion
|
||||
|
||||
Sample output of volume info command::
|
||||
|
||||
$ ceph fs volume info vol_name
|
||||
{
|
||||
"mon_addrs": [
|
||||
@ -124,16 +140,6 @@ Fetch the information of a CephFS volume using::
|
||||
"used_size": 0
|
||||
}
|
||||
|
||||
The output format is json and contains fields as follows.
|
||||
|
||||
* pools: Attributes of data and metadata pools
|
||||
* avail: The amount of free space available in bytes
|
||||
* used: The amount of storage consumed in bytes
|
||||
* name: Name of the pool
|
||||
* mon_addrs: List of monitor addresses
|
||||
* used_size: Current used size of the CephFS volume in bytes
|
||||
* pending_subvolume_deletions: Number of subvolumes pending deletion
|
||||
|
||||
FS Subvolume groups
|
||||
-------------------
|
||||
|
||||
@ -194,6 +200,7 @@ Check the presence of any subvolume group using::
|
||||
$ ceph fs subvolumegroup exist <vol_name>
|
||||
|
||||
The strings returned by the 'exist' command:
|
||||
|
||||
* "subvolumegroup exists": if any subvolumegroup is present
|
||||
* "no subvolumegroup exists": if no subvolumegroup is present
|
||||
|
||||
@ -344,6 +351,7 @@ Check the presence of any subvolume using::
|
||||
$ ceph fs subvolume exist <vol_name> [--group_name <subvol_group_name>]
|
||||
|
||||
The strings returned by the 'exist' command:
|
||||
|
||||
* "subvolume exists": if any subvolume of given group_name is present
|
||||
* "no subvolume exists": if no subvolume of given group_name is present
|
||||
|
||||
|
@ -53,7 +53,8 @@ If you have more than one FS on your Ceph cluster, use the option
|
||||
|
||||
ceph-fuse --id foo --client_fs mycephfs2 /mnt/mycephfs2
|
||||
|
||||
You may also add a ``client_fs`` setting to your ``ceph.conf``
|
||||
You may also add a ``client_fs`` setting to your ``ceph.conf``. Alternatively, the option
|
||||
``--client_mds_namespace`` is supported for backward compatibility.
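A minimal sketch of such a ``ceph.conf`` entry (the file system name here is just an example)::

    [client]
        client_fs = mycephfs2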
|
||||
|
||||
Unmounting CephFS
|
||||
=================
|
||||
|
@ -103,6 +103,28 @@ To mount a subtree of the CephFS root, append the path to the device string::
|
||||
|
||||
mount -t ceph cephuser@.cephfs=/subvolume/dir1/dir2 /mnt/mycephfs -o secretfile=/etc/ceph/cephuser.secret
|
||||
|
||||
Backward Compatibility
|
||||
======================
|
||||
The old syntax is supported for backward compatibility.
|
||||
|
||||
To mount CephFS with the kernel driver::
|
||||
|
||||
mkdir /mnt/mycephfs
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin
|
||||
|
||||
The key-value argument right after option ``-o`` is CephX credential;
|
||||
``name`` is the username of the CephX user we are using to mount CephFS.
|
||||
|
||||
To mount a non-default FS ``cephfs2``, in case the cluster has multiple FSs::
|
||||
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin,fs=cephfs2
|
||||
|
||||
or
|
||||
|
||||
mount -t ceph :/ /mnt/mycephfs -o name=admin,mds_namespace=cephfs2
|
||||
|
||||
.. note:: The option ``mds_namespace`` is deprecated. Use ``fs=`` instead when using the old syntax for mounting.
|
||||
|
||||
Unmounting CephFS
|
||||
=================
|
||||
To unmount the Ceph file system, use the ``umount`` command as usual::
|
||||
|
@ -31,7 +31,7 @@ POSIX semantics for various reasons:
|
||||
writes are not coherently propagated to other clients' caches. That
|
||||
is, if a page is cached on host A, and then updated on host B, host
|
||||
A's page is not coherently invalidated. (Shared writable mmap
|
||||
appears to be quite rare--we have yet to here any complaints about this
|
||||
appears to be quite rare--we have yet to hear any complaints about this
|
||||
behavior, and implementing cache coherency properly is complex.)
|
||||
- CephFS clients present a hidden ``.snap`` directory that is used to
|
||||
access, create, delete, and rename snapshots. Although the virtual
|
||||
@ -62,17 +62,15 @@ as client data may not even be flushed to the server until the file is
|
||||
closed (and more generally writes will be significantly more
|
||||
time-shifted than CephFS, leading to less predictable results).
|
||||
|
||||
However, all of there are very close to POSIX, and most of the time
|
||||
applications don't notice too much. Many other storage systems (e.g.,
|
||||
HDFS) claim to be "POSIX-like" but diverge significantly from the
|
||||
standard by dropping support for things like in-place file
|
||||
modifications, truncate, or directory renames.
|
||||
|
||||
Regardless, these are all similar enough to POSIX, and applications still work
|
||||
most of the time. Many other storage systems (e.g., HDFS) claim to be
|
||||
"POSIX-like" but diverge significantly from the standard by dropping support
|
||||
for things like in-place file modifications, truncate, or directory renames.
|
||||
|
||||
Bottom line
|
||||
-----------
|
||||
|
||||
CephFS relaxes more than local Linux kernel file systems (e.g., writes
|
||||
CephFS relaxes more than local Linux kernel file systems (for example, writes
|
||||
spanning object boundaries may be torn). It relaxes strictly less
|
||||
than NFS when it comes to multiclient consistency, and generally less
|
||||
than NFS when it comes to write atomicity.
|
||||
|
@ -5,6 +5,60 @@ CephFS allows quotas to be set on any directory in the system. The
|
||||
quota can restrict the number of *bytes* or the number of *files*
|
||||
stored beneath that point in the directory hierarchy.
|
||||
|
||||
Like most other things in CephFS, quotas are configured using virtual
|
||||
extended attributes:
|
||||
|
||||
* ``ceph.quota.max_files`` -- file limit
|
||||
* ``ceph.quota.max_bytes`` -- byte limit
|
||||
|
||||
If the extended attributes appear on a directory that means a quota is
|
||||
configured there. If they are not present then no quota is set on that
|
||||
directory (although one may still be configured on a parent directory).
|
||||
|
||||
To set a quota, set the extended attribute on a CephFS directory with a
|
||||
value::
|
||||
|
||||
setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir # 100 MB
|
||||
setfattr -n ceph.quota.max_files -v 10000 /some/dir # 10,000 files
|
||||
|
||||
To view quota limit::
|
||||
|
||||
$ getfattr -n ceph.quota.max_bytes /some/dir
|
||||
# file: dir1/
|
||||
ceph.quota.max_bytes="100000000"
|
||||
$
|
||||
$ getfattr -n ceph.quota.max_files /some/dir
|
||||
# file: dir1/
|
||||
ceph.quota.max_files="10000"
|
||||
|
||||
.. note:: Running ``getfattr /some/dir -d -m -`` for a CephFS directory will
|
||||
print none of the CephFS extended attributes. This is because the CephFS
|
||||
kernel and FUSE clients hide this information from the ``listxattr(2)``
|
||||
system call. Instead, a specific CephFS extended attribute can be viewed by
|
||||
running ``getfattr /some/dir -n ceph.<some-xattr>``.
|
||||
|
||||
To remove a quota, set the value of extended attribute to ``0``::
|
||||
|
||||
$ setfattr -n ceph.quota.max_bytes -v 0 /some/dir
|
||||
$ getfattr /some/dir -n ceph.quota.max_bytes
|
||||
dir1/: ceph.quota.max_bytes: No such attribute
|
||||
$
|
||||
$ setfattr -n ceph.quota.max_files -v 0 /some/dir
|
||||
$ getfattr dir1/ -n ceph.quota.max_files
|
||||
dir1/: ceph.quota.max_files: No such attribute
|
||||
|
||||
Space Usage Reporting and CephFS Quotas
|
||||
---------------------------------------
|
||||
When the root directory of the CephFS mount has quota set on it, the available
|
||||
space on the CephFS reported by space usage report tools (like ``df``) is
|
||||
based on quota limit. That is, ``available space = quota limit - used space``
|
||||
instead of ``available space = total space - used space``.
|
||||
|
||||
This behaviour can be disabled by setting the following option in the client section
|
||||
of ``ceph.conf``::
|
||||
|
||||
client quota df = false
|
||||
|
||||
Limitations
|
||||
-----------
|
||||
|
||||
@ -86,3 +140,11 @@ To remove a quota::
|
||||
|
||||
setfattr -n ceph.quota.max_bytes -v 0 /some/dir
|
||||
setfattr -n ceph.quota.max_files -v 0 /some/dir
|
||||
|
||||
|
||||
.. note:: In cases where CephFS extended attributes are set on a CephFS
|
||||
directory (for example, ``/some/dir``), running ``getfattr /some/dir -d -m
|
||||
-`` will not print those CephFS extended attributes. This is because CephFS
|
||||
kernel and FUSE clients hide this information from the ``listxattr(2)``
|
||||
system call. You can access a specific CephFS extended attribute by running
|
||||
``getfattr /some/dir -n ceph.<some-xattr>`` instead.
|
||||
|
@ -131,3 +131,15 @@ Control (ongoing) File System Scrubs
|
||||
{
|
||||
"return_code": 0
|
||||
}
|
||||
|
||||
Damages
|
||||
=======
|
||||
|
||||
The types of damage that can be reported and repaired by File System Scrub are:
|
||||
|
||||
* DENTRY : Inode's dentry is missing.
|
||||
|
||||
* DIR_FRAG : Inode's directory fragment(s) is missing.
|
||||
|
||||
* BACKTRACE : Inode's backtrace in the data pool is corrupted.
|
||||
|
||||
|
@ -49,10 +49,9 @@ The following time periods are recognized: `h(our), d(ay), w(eek), m(onth),
|
||||
y(ear)` and `n`. The latter is a special modifier where e.g. `10n` means keep
|
||||
the last 10 snapshots regardless of timing.
|
||||
|
||||
All subcommands take optional `fs` and `subvol` arguments to specify paths in
|
||||
All subcommands take optional `fs` argument to specify paths in
|
||||
multi-fs setups and :doc:`/cephfs/fs-volumes` managed setups. If not
|
||||
passed `fs` defaults to the first file system listed in the fs_map, `subvolume`
|
||||
defaults to nothing.
|
||||
passed `fs` defaults to the first file system listed in the fs_map.
|
||||
When using :doc:`/cephfs/fs-volumes` the argument `fs` is equivalent to a
|
||||
`volume`.
|
||||
|
||||
@ -66,16 +65,21 @@ When no subcommand is supplied a synopsis is printed::
|
||||
|
||||
#> ceph fs snap-schedule
|
||||
no valid command found; 8 closest matches:
|
||||
fs snap-schedule status [<path>] [<subvol>] [<fs>] [<format>]
|
||||
fs snap-schedule list <path> [<subvol>] [--recursive] [<fs>] [<format>]
|
||||
fs snap-schedule add <path> <snap_schedule> [<start>] [<fs>] [<subvol>]
|
||||
fs snap-schedule remove <path> [<repeat>] [<start>] [<subvol>] [<fs>]
|
||||
fs snap-schedule retention add <path> <retention_spec_or_period> [<retention_count>] [<fs>] [<subvol>]
|
||||
fs snap-schedule retention remove <path> <retention_spec_or_period> [<retention_count>] [<fs>] [<subvol>]
|
||||
fs snap-schedule activate <path> [<repeat>] [<start>] [<subvol>] [<fs>]
|
||||
fs snap-schedule deactivate <path> [<repeat>] [<start>] [<subvol>] [<fs>]
|
||||
fs snap-schedule status [<path>] [<fs>] [<format>]
|
||||
fs snap-schedule list <path> [--recursive] [<fs>] [<format>]
|
||||
fs snap-schedule add <path> <snap_schedule> [<start>] [<fs>]
|
||||
fs snap-schedule remove <path> [<repeat>] [<start>] [<fs>]
|
||||
fs snap-schedule retention add <path> <retention_spec_or_period> [<retention_count>] [<fs>]
|
||||
fs snap-schedule retention remove <path> <retention_spec_or_period> [<retention_count>] [<fs>]
|
||||
fs snap-schedule activate <path> [<repeat>] [<start>] [<fs>]
|
||||
fs snap-schedule deactivate <path> [<repeat>] [<start>] [<fs>]
|
||||
Error EINVAL: invalid command
|
||||
|
||||
Note:
|
||||
^^^^^
|
||||
A `subvolume` argument is no longer accepted by the commands.
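As a brief sketch of typical usage (the path and the one-hour repeat interval are illustrative), a schedule can be added, listed, and inspected like this::

    ceph fs snap-schedule add / 1h
    ceph fs snap-schedule list /
    ceph fs snap-schedule status /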
|
||||
|
||||
|
||||
Inspect snapshot schedules
|
||||
--------------------------
|
||||
|
||||
|
@ -554,7 +554,7 @@ In order to configure connections (from Ceph nodes) to the KDC:
|
||||
...
|
||||
|
||||
|
||||
6. A new *set parameter* was added in Ceph, ``gss ktab client file`` which
|
||||
6. A new *set parameter* was added in Ceph, ``gss_ktab_client_file`` which
|
||||
points to the keytab file related to the Ceph node *(or principal)* in
|
||||
question.
|
||||
|
||||
@ -614,10 +614,10 @@ In order to configure connections (from Ceph nodes) to the KDC:
|
||||
/etc/ceph/ceph.conf
|
||||
[global]
|
||||
...
|
||||
auth cluster required = gss
|
||||
auth service required = gss
|
||||
auth client required = gss
|
||||
gss ktab client file = /{$my_new_location}/{$my_new_ktab_client_file.keytab}
|
||||
auth_cluster_required = gss
|
||||
auth_service_required = gss
|
||||
auth_client_required = gss
|
||||
gss_ktab_client_file = /{$my_new_location}/{$my_new_ktab_client_file.keytab}
|
||||
...
|
||||
|
||||
|
||||
|
@ -32,7 +32,7 @@ cephadm/cephadm script into memory.)
|
||||
for mon or mgr.
|
||||
- You'll see health warnings from cephadm about stray daemons--that's because
|
||||
the vstart-launched daemons aren't controlled by cephadm.
|
||||
- The default image is ``quay.io/ceph-ci/ceph:master``, but you can change
|
||||
- The default image is ``quay.io/ceph-ci/ceph:main``, but you can change
|
||||
this by passing ``-o container_image=...`` or ``ceph config set global container_image ...``.
|
||||
|
||||
|
||||
|
@ -32,8 +32,8 @@ The ceph orch command will be extended to support maintenance.
|
||||
|
||||
.. code-block::
|
||||
|
||||
ceph orch host enter-maintenance <host> [ --check ]
|
||||
ceph orch host exit-maintenance <host>
|
||||
ceph orch host maintenance enter <host> [ --force ]
|
||||
ceph orch host maintenance exit <host>
|
||||
|
||||
.. note:: In addition, the host's status should be updated to reflect whether it
|
||||
is in maintenance or not.
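A hypothetical session (``host1`` is a placeholder host name) might therefore look like:

.. code-block::

    ceph orch host maintenance enter host1 --force
    ceph orch host maintenance exit host1
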
|
||||
|
@ -131,3 +131,8 @@ sharing a single pool (via namespaces), their snapshots *will* collide and
|
||||
deleting one will result in missing file data for others. (This may even be
|
||||
invisible, not throwing errors to the user.) If each FS gets its own
|
||||
pool things probably work, but this isn't tested and may not be true.
|
||||
|
||||
.. Note:: To avoid snap id collision between mon-managed snapshots and file system
|
||||
snapshots, pools with mon-managed snapshots are not allowed to be attached
|
||||
to a file system. Also, mon-managed snapshots can't be created in pools
|
||||
already attached to a file system either.
|
||||
|
@ -1,151 +0,0 @@
|
||||
===============
|
||||
Deduplication
|
||||
===============
|
||||
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
Applying data deduplication to an existing software stack is not easy,
because it requires additional metadata management and changes to the
original data processing procedure.
|
||||
|
||||
In a typical deduplication system, the input source as a data
|
||||
object is split into multiple chunks by a chunking algorithm.
|
||||
The deduplication system then compares each chunk with
|
||||
the existing data chunks already stored in the system.
|
||||
To this end, a fingerprint index that stores the hash value
|
||||
of each chunk is employed by the deduplication system
|
||||
in order to easily find the existing chunks by comparing
|
||||
hash value rather than searching all contents that reside in
|
||||
the underlying storage.
|
||||
|
||||
There are many challenges to implementing deduplication on top
of Ceph. Among them, two issues are essential.
The first is managing the scalability of the fingerprint index; the second is
ensuring compatibility between the newly added deduplication metadata and the
existing metadata.
|
||||
|
||||
Key Idea
|
||||
========
|
||||
1. Content hashing (Double hashing): Each client can find the object data
for an object ID using CRUSH. With CRUSH, a client knows the object's location
in the Base tier.
By hashing the object's content at the Base tier, a new OID (chunk ID) is generated.
The Chunk tier stores, under the new OID, an object holding part of the original object's content.
|
||||
|
||||
Client 1 -> OID=1 -> HASH(1's content)=K -> OID=K ->
|
||||
CRUSH(K) -> chunk's location
|
||||
|
||||
|
||||
2. Self-contained object: The external metadata design
|
||||
makes integration with storage feature support difficult,
because existing storage features cannot recognize the
additional external data structures. If we can design a data
deduplication system without any external component, the
original storage features can be reused.
|
||||
|
||||
More details in https://ieeexplore.ieee.org/document/8416369
|
||||
|
||||
Design
|
||||
======
|
||||
|
||||
.. ditaa::
|
||||
|
||||
+-------------+
|
||||
| Ceph Client |
|
||||
+------+------+
|
||||
^
|
||||
Tiering is |
|
||||
Transparent | Metadata
|
||||
to Ceph | +---------------+
|
||||
Client Ops | | |
|
||||
| +----->+ Base Pool |
|
||||
| | | |
|
||||
| | +-----+---+-----+
|
||||
| | | ^
|
||||
v v | | Dedup metadata in Base Pool
|
||||
+------+----+--+ | | (Dedup metadata contains chunk offsets
|
||||
| Objecter | | | and fingerprints)
|
||||
+-----------+--+ | |
|
||||
^ | | Data in Chunk Pool
|
||||
| v |
|
||||
| +-----+---+-----+
|
||||
| | |
|
||||
+----->| Chunk Pool |
|
||||
| |
|
||||
+---------------+
|
||||
Data
|
||||
|
||||
|
||||
Pool-based object management:
|
||||
We define two pools.
|
||||
The metadata pool stores metadata objects and the chunk pool stores
|
||||
chunk objects. Since these two pools are divided based on
|
||||
the purpose and usage, each pool can be managed more
|
||||
efficiently according to its different characteristics. Base
|
||||
pool and the chunk pool can separately select a redundancy
|
||||
scheme between replication and erasure coding depending on
|
||||
its usage and each pool can be placed in a different storage
|
||||
location depending on the required performance.
|
||||
|
||||
For details on usage, see ``osd_internals/manifest.rst``. A brief sketch follows.
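As a rough sketch (the pool names and PG counts are illustrative, and the
dedup-specific wiring is covered in the document referenced above), the two
pools could be created with different redundancy schemes::

    ceph osd pool create base_pool 64 64 replicated
    ceph osd pool create chunk_pool 64 64 erasure
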
|
||||
|
||||
Usage Patterns
|
||||
==============
|
||||
|
||||
The different Ceph interface layers present potentially different opportunities
|
||||
and costs for deduplication and tiering in general.
|
||||
|
||||
RadosGW
|
||||
-------
|
||||
|
||||
S3 big data workloads seem like a good opportunity for deduplication. These
|
||||
objects tend to be write once, read mostly objects which don't see partial
|
||||
overwrites. As such, it makes sense to fingerprint and dedup up front.
|
||||
|
||||
Unlike cephfs and rbd, radosgw has a system for storing
|
||||
explicit metadata in the head object of a logical s3 object for
|
||||
locating the remaining pieces. As such, radosgw could use the
|
||||
refcounting machinery (``osd_internals/refcount.rst``) directly without
|
||||
needing direct support from rados for manifests.
|
||||
|
||||
RBD/Cephfs
|
||||
----------
|
||||
|
||||
RBD and CephFS both use deterministic naming schemes to partition
|
||||
block devices/file data over rados objects. As such, the redirection
|
||||
metadata would need to be included as part of rados, presumably
|
||||
transparently.
|
||||
|
||||
Moreover, unlike radosgw, rbd/cephfs rados objects can see overwrites.
|
||||
For those objects, we don't really want to perform dedup, and we don't
|
||||
want to pay a write latency penalty in the hot path to do so anyway.
|
||||
As such, performing tiering and dedup on cold objects in the background
|
||||
is likely to be preferred.
|
||||
|
||||
One important wrinkle, however, is that both rbd and cephfs workloads
|
||||
often feature usage of snapshots. This means that the rados manifest
|
||||
support needs robust support for snapshots.
|
||||
|
||||
RADOS Machinery
|
||||
===============
|
||||
|
||||
For more information on rados redirect/chunk/dedup support, see ``osd_internals/manifest.rst``.
|
||||
For more information on rados refcount support, see ``osd_internals/refcount.rst``.
|
||||
|
||||
Status and Future Work
|
||||
======================
|
||||
|
||||
At the moment, there exists some preliminary support for manifest
|
||||
objects within the OSD as well as a dedup tool.
|
||||
|
||||
RadosGW data warehouse workloads probably represent the largest
|
||||
opportunity for this feature, so the first priority is probably to add
|
||||
direct support for fingerprinting and redirects into the refcount pool
|
||||
to radosgw.
|
||||
|
||||
Aside from radosgw, completing work on manifest object support in the
|
||||
OSD particularly as it relates to snapshots would be the next step for
|
||||
rbd and cephfs workloads.
|
||||
|
@ -87,7 +87,7 @@ The procedure for making changes to the Ceph repository is as follows:
|
||||
|
||||
#. :ref:`Push the changes in your local working copy to your fork<push_changes>`.
|
||||
|
||||
#. Create a Pull Request to push the change upstream
|
||||
#. Create a Pull Request to push the change upstream.
|
||||
|
||||
#. Create a Pull Request that asks for your changes to be added into the
|
||||
"upstream Ceph" repository.
|
||||
@ -133,8 +133,8 @@ Configuring Your Local Environment
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The commands in this section configure your local git environment so that it
|
||||
generates "Signed-off-by:" tags. They also set up your local environment so
|
||||
that it can stay synchronized with the upstream repository.
|
||||
generates "Signed-off-by:" tags. These commands also set up your local
|
||||
environment so that it can stay synchronized with the upstream repository.
|
||||
|
||||
These commands are necessary only during the initial setup of your local
|
||||
working copy. Another way to say that is "These commands are necessary
|
||||
@ -172,7 +172,7 @@ Fixing the Bug
|
||||
Synchronizing Local Main with Upstream Main
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
In your local git environment, there is a copy of the ``main`` branch in
|
||||
In your local working copy, there is a copy of the ``main`` branch in
|
||||
``remotes/origin/main``. This is called "local main". This copy of the
|
||||
main branch (https://github.com/your_github_id/ceph.git) is "frozen in time"
|
||||
at the moment that you cloned it, but the upstream repo
|
||||
@ -184,9 +184,8 @@ Because upstream main is continually receiving updates from other
|
||||
contributors, your fork will drift farther and farther from the state of the
|
||||
upstream repo when you cloned it.
|
||||
|
||||
You must keep your fork's main branch synchronized with upstream main in
|
||||
order to reduce drift between your fork's main branch and the upstream main
|
||||
branch.
|
||||
Keep your fork's ``main`` branch synchronized with upstream main to reduce drift
|
||||
between your fork's main branch and the upstream main branch.
|
||||
|
||||
Here are the commands for keeping your fork synchronized with the
|
||||
upstream repository:
|
||||
@ -198,8 +197,11 @@ upstream repository:
|
||||
git reset --hard ceph/main
|
||||
git push -u origin main
|
||||
|
||||
This procedure should be followed often, in order to keep your local ``main``
|
||||
in sync with upstream ``main``.
|
||||
Follow this procedure often to keep your local ``main`` in sync with upstream
|
||||
``main``.
|
||||
|
||||
If the command ``git status`` returns a line that reads "Untracked files", see
|
||||
:ref:`the procedure on updating submodules <update-submodules>`.
|
||||
|
||||
.. _bugfix_branch:
|
||||
|
||||
@ -230,15 +232,15 @@ your local working repository to your fork of the upstream repository.
|
||||
Fixing the bug in the local working copy
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
#. Updating the tracker
|
||||
#. **Updating the tracker**
|
||||
|
||||
In the `Ceph issue tracker <https://tracker.ceph.com>`_, change the status
|
||||
of the tracker issue to "In progress". This communicates to other Ceph
|
||||
contributors that you have begun working on a fix, which helps to avoid
|
||||
duplication of effort. If you don't have permission to change that field,
|
||||
your comment that you are working on the issue is sufficient.
|
||||
just comment that you are working on the issue.
|
||||
|
||||
#. Fixing the bug itself
|
||||
#. **Fixing the bug itself**
|
||||
|
||||
This guide cannot tell you how to fix the bug that you have chosen to fix.
|
||||
This guide assumes that you know what required improvement, and that you
|
||||
@ -280,13 +282,13 @@ believe that it works.
|
||||
|
||||
.. note::
|
||||
|
||||
In the command `git push origin fix_1`, `origin` is the name of your fork
|
||||
of the upstream Ceph repository, and can be thought of as a nickname for
|
||||
`git@github.com:username/ceph.git`, where `username` is your GitHub
|
||||
username.
|
||||
In the command ``git push origin fix_1``, ``origin`` is the name of your
|
||||
fork of the upstream Ceph repository, and can be thought of as a nickname
|
||||
for ``git@github.com:username/ceph.git``, where ``username`` is your
|
||||
GitHub username.
|
||||
|
||||
It is possible that `origin` is not the name of your fork. Discover the
|
||||
name of your fork by running `git remote -v`, as shown here:
|
||||
It is possible that ``origin`` is not the name of your fork. Discover the
|
||||
name of your fork by running ``git remote -v``, as shown here:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@ -296,9 +298,16 @@ believe that it works.
|
||||
origin git@github.com:username/ceph.git (fetch)
|
||||
origin git@github.com:username/ceph.git (push)
|
||||
|
||||
The line "origin git@github.com:username/ceph.git (fetch)" and the line
|
||||
"origin git@github.com:username/ceph.git (push)" provide the information
|
||||
that "origin" is the name of your fork of the Ceph repository.
|
||||
The line::
|
||||
|
||||
origin git@github.com:username/ceph.git (fetch)
|
||||
|
||||
and the line::
|
||||
|
||||
origin git@github.com:username/ceph.git (push)
|
||||
|
||||
provide the information that "origin" is the name of your fork of the
|
||||
Ceph repository.
|
||||
|
||||
|
||||
Opening a GitHub pull request
|
||||
@ -507,3 +516,58 @@ the **ptl-tool** have the following form::
|
||||
client: move client_lock to _unmount()
|
||||
client: add timer_lock support
|
||||
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
|
||||
|
||||
Miscellaneous
|
||||
-------------
|
||||
|
||||
--set-upstream
|
||||
^^^^^^^^^^^^^^
|
||||
|
||||
If you forget to include the ``--set-upstream origin x`` option in your ``git
|
||||
push`` command, you will see the following error message:
|
||||
|
||||
::
|
||||
|
||||
fatal: The current branch {x} has no upstream branch.
|
||||
To push the current branch and set the remote as upstream, use
|
||||
git push --set-upstream origin {x}
|
||||
|
||||
To set up git to automatically create the upstream branch that corresponds to
|
||||
the branch in your local working copy, run this command from within the
|
||||
``ceph/`` directory:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git config --global push.autoSetupRemote true
|
||||
|
||||
Deleting a Branch Locally
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To delete the branch named ``localBranchName`` from the local working copy, run
|
||||
a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git branch -d localBranchName
|
||||
|
||||
Deleting a Branch Remotely
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To delete the branch named ``remoteBranchName`` from the remote upstream branch
|
||||
(which is also your fork of ``ceph/ceph``, as described in :ref:`forking`), run
|
||||
a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git push origin --delete remoteBranchName
|
||||
|
||||
Searching a File Longitudinally for a String
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
To search for the commit that introduced a given string (in this example, that
|
||||
string is ``foo``) into a given file (in this example, that file is
|
||||
``file.rst``), run a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git log -S 'foo' file.rst
|
||||
|
@ -215,8 +215,8 @@ The build process is based on `Node.js <https://nodejs.org/>`_ and requires the
|
||||
Prerequisites
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
* Node 12.18.2 or higher
|
||||
* NPM 6.13.4 or higher
|
||||
* Node 14.15.0 or higher
|
||||
* NPM 6.14.9 or higher
|
||||
|
||||
nodeenv:
|
||||
During Ceph's build we create a virtualenv with ``node`` and ``npm``
|
||||
@ -290,7 +290,7 @@ HTML files:
|
||||
- `html-linter <https://github.com/chinchiheather/html-linter>`_
|
||||
- `htmllint-cli <https://github.com/htmllint/htmllint-cli>`_
|
||||
- `Prettier <https://prettier.io/>`_
|
||||
- `TSLint <https://palantir.github.io/tslint/>`_
|
||||
- `ESLint <https://eslint.org/>`_
|
||||
- `stylelint <https://stylelint.io/>`_
|
||||
|
||||
We added 2 npm scripts to help run these tools:
|
||||
@ -1039,8 +1039,8 @@ scenarios.
|
||||
|
||||
For example - ``throw new DashboardNotFoundError()``.
|
||||
|
||||
I18N
|
||||
----
|
||||
Internationalization (i18n)
|
||||
---------------------------
|
||||
|
||||
How to extract messages from source code?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@ -1204,6 +1204,92 @@ Keep elements that affect the sentence:
|
||||
<!-- recommended -->
|
||||
<span i18n>Profile <b>foo</b> will be removed.</span>
|
||||
|
||||
|
||||
.. _accessibility:
|
||||
|
||||
Accessibility
|
||||
-------------
|
||||
|
||||
Many parts of the Ceph Dashboard are modeled on `Web Content Accessibility Guidelines (WCAG) 2.1 <https://www.w3.org/TR/WCAG21/>`_ level A accessibility conformance guidelines.
|
||||
By implementing accessibility best practices, you are improving the usability of the Ceph Dashboard for blind and visually impaired users.
|
||||
|
||||
Summary
|
||||
~~~~~~~
|
||||
|
||||
A few things you should check before introducing a new code change include:
|
||||
|
||||
1) Add `ARIA labels and descriptions <https://www.w3.org/TR/wai-aria/>`_ to actionable HTML elements.
|
||||
2) Don't forget to tag ARIA labels/descriptions or any user-readable text for translation (i18n-title, i18n-aria-label...).
|
||||
3) Add `ARIA roles <https://www.w3.org/TR/wai-aria/#usage_intro>`_ to tag HTML elements that behave differently from their intended behaviour (<a> tags behaving as <buttons>) or that provide extended behaviours (roles).
|
||||
4) Avoid poor `color contrast choices <https://www.w3.org/TR/WCAG21/#contrast-minimum>`_ (foreground-background) when styling a component. Here are some :ref:`tools <color-contrast-checkers>` you can use.
|
||||
5) When testing menus or dropdowns, be sure to scan them with an :ref:`accessibility checker <accessibility-checkers>` in both opened and closed states. Sometimes issues are hidden when menus are closed.
|
||||
|
||||
.. _accessibility-checkers:
|
||||
|
||||
Accessibility checkers
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
During development, you can test the accessibility compliance of your features using one of the tools below:
|
||||
|
||||
- `Accessibility insights plugin <https://accessibilityinsights.io/downloads/>`_
|
||||
- `Site Improve plugin <https://www.siteimprove.com/integrations/browser-extensions/>`_
|
||||
- `Axe devtools <https://www.deque.com/axe/devtools/>`_
|
||||
|
||||
Testing with two or more of these tools can greatly improve the detection of accessibility violations.
|
||||
|
||||
.. _color-contrast-checkers:
|
||||
|
||||
Color contrast checkers
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
When adding new colors, making sure they are accessible is also important. Here are some tools which can help with color contrast testing:
|
||||
|
||||
- `Accessible web color-contrast checker <https://accessibleweb.com/color-contrast-checker/>`_
|
||||
- `Colorsafe generator <https://colorsafe.co/>`_
|
||||
|
||||
Accessibility linters
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you use VSCode, you may install the `axe accessibility linter <https://marketplace.visualstudio.com/items?itemName=deque-systems.vscode-axe-linter>`_,
|
||||
which can help you catch and fix potential issues during development.
|
||||
|
||||
Accessibility testing
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Our e2e testing suite, which is based on Cypress, supports the addition of accessibility tests using `axe-core <https://github.com/dequelabs/axe-core>`_
|
||||
and `cypress-axe <https://github.com/component-driven/cypress-axe>`_. A custom Cypress command, `cy.checkAccessibility`, can also be used directly.
|
||||
This is a great way to prevent accessibility regressions on high impact components.
|
||||
|
||||
Tests can be found under the `a11y folder <./src/pybind/mgr/dashboard/frontend/cypress/integration/a11y>`_ in the dashboard. Here is an example:
|
||||
|
||||
.. code:: TypeScript
|
||||
|
||||
describe('Navigation accessibility', { retries: 0 }, () => {
|
||||
const shared = new NavigationPageHelper();
|
||||
|
||||
beforeEach(() => {
|
||||
cy.login();
|
||||
Cypress.Cookies.preserveOnce('token');
|
||||
shared.navigateTo();
|
||||
});
|
||||
|
||||
it('top-nav should have no accessibility violations', () => {
|
||||
cy.injectAxe();
|
||||
cy.checkAccessibility('.cd-navbar-top');
|
||||
});
|
||||
|
||||
it('sidebar should have no accessibility violations', () => {
|
||||
cy.injectAxe();
|
||||
cy.checkAccessibility('nav[id=sidebar]');
|
||||
});
|
||||
|
||||
});
|
||||
|
||||
Additional guidelines
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If you're unsure about which UI pattern to follow in order to implement an accessibility fix, `patternfly <https://www.patternfly.org/v4/accessibility/accessibility-fundamentals>`_ guidelines can be used.
|
||||
|
||||
Backend Development
|
||||
-------------------
|
||||
|
||||
|
@ -89,6 +89,11 @@ click on `New issue`_.
|
||||
.. _`jump to the Ceph project`: http://tracker.ceph.com/projects/ceph
|
||||
.. _`New issue`: http://tracker.ceph.com/projects/ceph/issues/new
|
||||
|
||||
Slack
|
||||
-----
|
||||
|
||||
Ceph's Slack is https://ceph-storage.slack.com/.
|
||||
|
||||
.. _mailing-list:
|
||||
|
||||
Mailing lists
|
||||
|
@ -135,7 +135,7 @@ integration tests for all the Ceph components.
|
||||
- verify that teuthology can run integration tests, with and without OpenStack
|
||||
|
||||
* - `upgrade <https://github.com/ceph/ceph/tree/master/qa/suites/upgrade>`_
|
||||
- for various versions of Ceph, verify that upgrades can happen without disrupting an ongoing workload
|
||||
- for various versions of Ceph, verify that upgrades can happen without disrupting an ongoing workload (`Upgrade Testing`_)
|
||||
|
||||
teuthology-describe
|
||||
-------------------
|
||||
@ -449,6 +449,109 @@ A single test from the rbd/thrash suite can be run by adding the
|
||||
--suite rbd/thrash \
|
||||
--filter 'rbd/thrash/{clusters/fixed-2.yaml clusters/openstack.yaml workloads/rbd_api_tests_copy_on_read.yaml}'
|
||||
|
||||
.. _upgrade-testing:
|
||||
|
||||
Upgrade Testing
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
Using the upgrade suite we are able to verify that upgrades from earlier releases can complete
|
||||
successfully without disrupting any ongoing workload.
|
||||
Each release branch's upgrade directory includes 2-x upgrade testing,
meaning that upgrades from the two preceding releases to the current one can be tested.
|
||||
The upgrade sequence is done in `parallel <https://github.com/ceph/teuthology/blob/main/teuthology/task/parallel.py>`_
|
||||
with other given workloads.
|
||||
|
||||
For instance, the upgrade test directory from the Quincy release branch is as follows:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
.
|
||||
├── octopus-x
|
||||
└── pacific-x
|
||||
|
||||
It is possible to test upgrades from Octopus (2-x) or from Pacific (1-x) to Quincy (x).
|
||||
A simple upgrade test consists of the following parts, in order:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
├── 0-start.yaml
|
||||
├── 1-tasks.yaml
|
||||
├── upgrade-sequence.yaml
|
||||
└── workload
|
||||
|
||||
After starting the cluster with the older release, we begin running the given ``workload``
and the ``upgrade-sequence`` in parallel.
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
- print: "**** done start parallel"
|
||||
- parallel:
|
||||
- workload
|
||||
- upgrade-sequence
|
||||
- print: "**** done end parallel"
|
||||
|
||||
While the ``workload`` directory consists of regular yaml files, just as in any other suite,
the ``upgrade-sequence`` is responsible for running the upgrade and awaiting its completion:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
- print: "**** done start upgrade, wait"
|
||||
...
|
||||
mon.a:
|
||||
- ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1
|
||||
- while ceph orch upgrade status | jq '.in_progress' | grep true ; do ceph orch ps ; ceph versions ; sleep 30 ; done\
|
||||
...
|
||||
- print: "**** done end upgrade, wait..."
|
||||
|
||||
|
||||
It is also possible to upgrade in stages while running workloads in between those:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
├── %
|
||||
├── 0-cluster
|
||||
├── 1-ceph-install
|
||||
├── 2-partial-upgrade
|
||||
├── 3-thrash
|
||||
├── 4-workload
|
||||
├── 5-finish-upgrade.yaml
|
||||
├── 6-quincy.yaml
|
||||
└── 8-final-workload
|
||||
|
||||
After starting a cluster, we upgrade only 2/3 of the cluster (``2-partial-upgrade``).
The next stage runs thrash tests and the given workload tests. Later on, we continue upgrading the
rest of the cluster (``5-finish-upgrade.yaml``).
The last stage requires the updated release (``ceph require-osd-release quincy``,
``ceph osd set-require-min-compat-client quincy``) and runs the ``final-workload``.
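As with any other suite, an upgrade run can be scheduled with ``teuthology-suite``;
the following invocation is only a sketch (machine type, suite name, and branch
depend on what you are testing):

.. prompt:: bash $

   teuthology-suite --machine-type smithi --suite upgrade/pacific-x --ceph quincy
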
Position Independent Linking
|
||||
----------------------------
|
||||
|
||||
Under the ``qa/suites`` directory are ``.qa`` symbolic links in every
|
||||
directory. Each link is recursive by always linking to ``../.qa/``. The final
|
||||
terminating link is in the ``qa/`` directory itself as ``qa/.qa -> .``. This
|
||||
layout of symbolic links allows a suite to be easily copied or moved without
|
||||
breaking a number of symbolic links. For example::
|
||||
|
||||
qa/suites/fs/upgrade/nofs/centos_latest.yaml -> .qa/distros/supported/centos_latest.yaml
|
||||
|
||||
If we copy the ``nofs`` suite somewhere else, add a parent directory above
|
||||
``nofs``, or move the ``centos_latest.yaml`` fragment into a sub-directory, the
|
||||
link will not break. Compare to::
|
||||
|
||||
qa/suites/fs/upgrade/nofs/centos_latest.yaml -> ../../../../distros/supported/centos_latest.yaml
|
||||
|
||||
If the link is moved, it is very likely it will break because the number of
|
||||
parent directories to reach the ``distros`` directory may change.
|
||||
|
||||
When adding new directories or suites, remember to also add the
``.qa`` symbolic links. A trivial find command can do this for you:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
find qa/suites/ -type d -execdir ln -sfT ../.qa/ {}/.qa \;
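To double-check the result, one illustrative way to list any ``.qa`` links whose
target does not resolve is:

.. prompt:: bash $

   find qa/suites/ -name .qa -type l ! -exec test -e {} \; -print
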
|
||||
|
||||
|
||||
Filtering tests by their description
|
||||
------------------------------------
|
||||
|
||||
|
@ -166,5 +166,12 @@ Unit test caveats
|
||||
explicitly linked against something else. This enables tools such as
|
||||
**valgrind** to be used in the tests.
|
||||
|
||||
#. Google Test unit testing library hides the client output from the shell.
|
||||
In order to debug the client after setting the desired debug level
(e.g. ``ceph config set client debug_rbd 20``), the debug log file can
be found at ``build/out/client.admin.<pid>.log``.
This can also be handy when examining failed teuthology unit-test
jobs; the job's debug level can be set in the relevant yaml file. A brief
example follows.
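For example (a sketch; the log file name depends on the client PID), raise the
RBD client debug level and then inspect the newest client log:

.. prompt:: bash $

   ceph config set client debug_rbd 20
   ls -t build/out/client.admin.*.log | head -1
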
|
||||
|
||||
.. _make check:
|
||||
.. _teuthology framework: https://github.com/ceph/teuthology
|
||||
|
@ -88,6 +88,10 @@ separate file, like this::
|
||||
|
||||
.. graphviz:: myfile.dot
|
||||
|
||||
See the `Dot User's Manual <https://www.graphviz.org/pdf/dotguide.pdf>`_ by
|
||||
Emden R. Gansner, Eleftherios Koutsofios, and Stephen North for examples of
|
||||
digraphs. This is especially useful if this is your first time encountering
|
||||
GraphViz.
|
||||
|
||||
Ditaa
|
||||
-----
|
||||
|
@ -4,10 +4,6 @@
|
||||
|
||||
.. graphviz::
|
||||
|
||||
/*
|
||||
* Rough outline of object store module dependencies
|
||||
*/
|
||||
|
||||
digraph object_store {
|
||||
size="7,7";
|
||||
node [color=lightblue2, style=filled, fontname="Serif"];
|
||||
@ -68,3 +64,4 @@
|
||||
|
||||
|
||||
.. todo:: write more here
|
||||
|
||||
|
@ -6,47 +6,52 @@ Glossary
|
||||
--------
|
||||
|
||||
*chunk*
|
||||
when the encoding function is called, it returns chunks of the same
|
||||
size. Data chunks which can be concatenated to reconstruct the original
|
||||
object and coding chunks which can be used to rebuild a lost chunk.
|
||||
When the encoding function is called, it returns chunks of the same
|
||||
size as each other. There are two kinds of chunks: (1) *data
|
||||
chunks*, which can be concatenated to reconstruct the original
|
||||
object, and (2) *coding chunks*, which can be used to rebuild a
|
||||
lost chunk.
|
||||
|
||||
*chunk rank*
|
||||
the index of a chunk when returned by the encoding function. The
|
||||
rank of the first chunk is 0, the rank of the second chunk is 1
|
||||
etc.
|
||||
|
||||
*stripe*
|
||||
when an object is too large to be encoded with a single call,
|
||||
each set of chunks created by a call to the encoding function is
|
||||
called a stripe.
|
||||
|
||||
*shard|strip*
|
||||
an ordered sequence of chunks of the same rank from the same
|
||||
object. For a given placement group, each OSD contains shards of
|
||||
the same rank. When dealing with objects that are encoded with a
|
||||
single operation, *chunk* is sometime used instead of *shard*
|
||||
because the shard is made of a single chunk. The *chunks* in a
|
||||
*shard* are ordered according to the rank of the stripe they belong
|
||||
to.
|
||||
The index of a chunk, as determined by the encoding function. The
|
||||
rank of the first chunk is 0, the rank of the second chunk is 1,
|
||||
and so on.
|
||||
|
||||
*K*
|
||||
the number of data *chunks*, i.e. the number of *chunks* in which the
|
||||
original object is divided. For instance if *K* = 2 a 10KB object
|
||||
will be divided into *K* objects of 5KB each.
|
||||
The number of data chunks into which an object is divided. For
|
||||
example, if *K* = 2, then a 10KB object is divided into two objects
|
||||
of 5KB each.
|
||||
|
||||
*M*
|
||||
the number of coding *chunks*, i.e. the number of additional *chunks*
|
||||
computed by the encoding functions. If there are 2 coding *chunks*,
|
||||
it means 2 OSDs can be out without losing data.
|
||||
The number of coding chunks computed by the encoding function. *M*
|
||||
is equal to the number of OSDs that can be missing from the cluster
|
||||
without the cluster suffering data loss. For example, if there are
|
||||
two coding chunks, then two OSDs can be missing without data loss.
|
||||
|
||||
*N*
|
||||
the number of data *chunks* plus the number of coding *chunks*,
|
||||
i.e. *K+M*.
|
||||
The number of data chunks plus the number of coding chunks: that
|
||||
is, *K* + *M*.
|
||||
|
||||
*rate*
|
||||
the proportion of the *chunks* that contains useful information, i.e. *K/N*.
|
||||
For instance, for *K* = 9 and *M* = 3 (i.e. *K+M* = *N* = 12) the rate is
|
||||
*K* = 9 / *N* = 12 = 0.75, i.e. 75% of the chunks contain useful information.
|
||||
The proportion of the total chunks containing useful information:
|
||||
that is, *K* divided by *N*. For example, suppose that *K* = 9 and
|
||||
*M* = 3. This would mean that *N* = 12 (because *K* + *M* = 9 + 3).
|
||||
Therefore, the *rate* (*K* / *N*) would be 9 / 12 = 0.75. In other
|
||||
words, 75% of the chunks would contain useful information.
|
||||
|
||||
*shard* (also called *strip*)
|
||||
An ordered sequence of chunks of the same rank from the same object. For a
|
||||
given placement group, each OSD contains shards of the same rank. In the
|
||||
special case in which an object is encoded with only one call to the
|
||||
encoding function, the term *chunk* may be used instead of *shard* because
|
||||
the shard is made of a single chunk. The chunks in a shard are ordered
|
||||
according to the rank of the stripe (see *stripe* below) they belong to.
|
||||
|
||||
|
||||
*stripe*
|
||||
If an object is so large that encoding it requires more than one
|
||||
call to the encoding function, each of these calls creates a set of
|
||||
chunks called a *stripe*.
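As a concrete illustration of *K* and *M* (the profile name below is arbitrary),
an erasure-code profile with *K* = 9 and *M* = 3 can be declared like this:

.. prompt:: bash $

   ceph osd erasure-code-profile set myprofile k=9 m=3
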
|
||||
|
||||
The definitions are illustrated as follows (PG stands for placement group):
|
||||
::
|
||||
@ -71,8 +76,8 @@ The definitions are illustrated as follows (PG stands for placement group):
|
||||
| ... | | ... |
|
||||
+-------------------------+ +-------------------------+
|
||||
|
||||
Table of content
|
||||
----------------
|
||||
Table of contents
|
||||
-----------------
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
|
@ -2,198 +2,396 @@
|
||||
Ceph Glossary
|
||||
===============
|
||||
|
||||
Ceph is growing rapidly. As firms deploy Ceph, the technical terms such as
|
||||
"RADOS", "RBD," "RGW" and so forth require corresponding marketing terms
|
||||
that explain what each component does. The terms in this glossary are
|
||||
intended to complement the existing technical terminology.
|
||||
|
||||
Sometimes more than one term applies to a definition. Generally, the first
|
||||
term reflects a term consistent with Ceph's marketing, and secondary terms
|
||||
reflect either technical terms or legacy ways of referring to Ceph systems.
|
||||
|
||||
|
||||
.. glossary::
|
||||
|
||||
Ceph Project
|
||||
The aggregate term for the people, software, mission and infrastructure
|
||||
of Ceph.
|
||||
Application
|
||||
More properly called a :term:`client`, an application is any program
|
||||
external to Ceph that uses a Ceph Cluster to store and
|
||||
replicate data.
|
||||
|
||||
cephx
|
||||
The Ceph authentication protocol. Cephx operates like Kerberos, but it
|
||||
has no single point of failure.
|
||||
:ref:`BlueStore<rados_config_storage_devices_bluestore>`
|
||||
OSD BlueStore is a storage back end used by OSD daemons, and
|
||||
was designed specifically for use with Ceph. BlueStore was
|
||||
introduced in the Ceph Kraken release. In the Ceph Luminous
|
||||
release, BlueStore became Ceph's default storage back end,
|
||||
supplanting FileStore. Unlike :term:`filestore`, BlueStore
|
||||
stores objects directly on Ceph block devices without any file
|
||||
system interface. Since Luminous (12.2), BlueStore has been
|
||||
Ceph's default and recommended storage back end.
|
||||
|
||||
Bucket
|
||||
In the context of :term:`RGW`, a bucket is a group of objects.
|
||||
In a filesystem-based analogy in which objects are the
|
||||
counterpart of files, buckets are the counterpart of
|
||||
directories. :ref:`Multisite sync
|
||||
policies<radosgw-multisite-sync-policy>` can be set on buckets,
|
||||
to provide fine-grained control of data movement from one zone
|
||||
to another zone.
|
||||
|
||||
The concept of the bucket has been taken from AWS S3. See also
|
||||
`the AWS S3 page on creating buckets <https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-buckets-s3.html>`_
|
||||
and `the AWS S3 'Buckets Overview' page <https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingBucket.html>`_.
|
||||
|
||||
OpenStack Swift uses the term "containers" for what RGW and AWS call "buckets".
|
||||
See `the OpenStack Storage API overview page <https://docs.openstack.org/swift/latest/api/object_api_v1_overview.html>`_.
|
||||
|
||||
Ceph
|
||||
Ceph is a distributed network storage and file system with
|
||||
distributed metadata management and POSIX semantics.
|
||||
|
||||
Ceph Block Device
|
||||
A software instrument that orchestrates the storage of
|
||||
block-based data in Ceph. Ceph Block Device (also called "RBD",
|
||||
or "RADOS block device") splits block-based application data
|
||||
into "chunks". RADOS stores these chunks as objects. Ceph Block
|
||||
Device orchestrates the storage of those objects across the
|
||||
storage cluster. See also :term:`RBD`.
|
||||
|
||||
Ceph Block Storage
|
||||
One of the three kinds of storage supported by Ceph (the other
|
||||
two are object storage and file storage). Ceph Block Storage is
|
||||
the block storage "product", which refers to block-storage
|
||||
related services and capabilities when used in conjunction with
|
||||
the collection of (1) ``librbd`` (a python module that provides
|
||||
file-like access to :term:`RBD` images), (2) a hypervisor such
|
||||
as QEMU or Xen, and (3) a hypervisor abstraction layer such as
|
||||
``libvirt``.
|
||||
|
||||
Ceph Client
|
||||
Any of the Ceph components that can access a Ceph Storage
|
||||
Cluster. This includes the Ceph Object Gateway, the Ceph Block
|
||||
Device, the Ceph File System, and their corresponding
|
||||
libraries. It also includes kernel modules, and FUSEs
|
||||
(Filesystems in USERspace).
|
||||
|
||||
Ceph Client Libraries
|
||||
The collection of libraries that can be used to interact with
|
||||
components of the Ceph Cluster.
|
||||
|
||||
Ceph Cluster Map
|
||||
See :term:`Cluster Map`
|
||||
|
||||
Ceph Dashboard
|
||||
:ref:`The Ceph Dashboard<mgr-dashboard>` is a built-in
|
||||
web-based Ceph management and monitoring application through
|
||||
which you can inspect and administer various resources within
|
||||
the cluster. It is implemented as a :ref:`ceph-manager-daemon`
|
||||
module.
|
||||
|
||||
Ceph File System
|
||||
See :term:`CephFS`
|
||||
|
||||
:ref:`CephFS<ceph-file-system>`
|
||||
The **Ceph F**\ile **S**\ystem, or CephFS, is a
|
||||
POSIX-compliant file system built on top of Ceph’s distributed
|
||||
object store, RADOS. See :ref:`CephFS Architecture
|
||||
<arch-cephfs>` for more details.
|
||||
|
||||
Ceph Interim Release
|
||||
See :term:`Releases`.
|
||||
|
||||
Ceph Kernel Modules
|
||||
The collection of kernel modules that can be used to interact
|
||||
with the Ceph Cluster (for example: ``ceph.ko``, ``rbd.ko``).
|
||||
|
||||
:ref:`Ceph Manager<ceph-manager-daemon>`
|
||||
The Ceph manager daemon (ceph-mgr) is a daemon that runs
|
||||
alongside monitor daemons to provide monitoring and interfacing
|
||||
to external monitoring and management systems. Since the
|
||||
Luminous release (12.x), no Ceph cluster functions properly
|
||||
unless it contains a running ceph-mgr daemon.
|
||||
|
||||
Ceph Manager Dashboard
|
||||
See :term:`Ceph Dashboard`.
|
||||
|
||||
Ceph Metadata Server
|
||||
See :term:`MDS`.
|
||||
|
||||
Ceph Monitor
|
||||
A daemon that maintains a map of the state of the cluster. This
|
||||
"cluster state" includes the monitor map, the manager map, the
|
||||
OSD map, and the CRUSH map. A Ceph cluster must contain a
|
||||
minimum of three running monitors in order to be both redundant
|
||||
and highly-available. Ceph monitors and the nodes on which they
|
||||
run are often referred to as "mon"s. See :ref:`Monitor Config
|
||||
Reference <monitor-config-reference>`.
|
||||
|
||||
Ceph Node
|
||||
A Ceph node is a unit of the Ceph Cluster that communicates with
|
||||
other nodes in the Ceph Cluster in order to replicate and
|
||||
redistribute data. All of the nodes together are called the
|
||||
:term:`Ceph Storage Cluster`. Ceph nodes include :term:`OSD`\s,
|
||||
:term:`Ceph Monitor`\s, :term:`Ceph Manager`\s, and
|
||||
:term:`MDS`\es. The term "node" is usually equivalent to "host"
|
||||
in the Ceph documentation. If you have a running Ceph Cluster,
|
||||
you can list all of the nodes in it by running the command
|
||||
``ceph node ls all``.
|
||||
|
||||
:ref:`Ceph Object Gateway<object-gateway>`
|
||||
An object storage interface built on top of librados. Ceph
|
||||
Object Gateway provides a RESTful gateway between applications
|
||||
and Ceph storage clusters.
|
||||
|
||||
Ceph Object Storage
|
||||
See :term:`Ceph Object Store`.
|
||||
|
||||
Ceph Object Store
|
||||
A Ceph Object Store consists of a :term:`Ceph Storage Cluster`
|
||||
and a :term:`Ceph Object Gateway` (RGW).
|
||||
|
||||
:ref:`Ceph OSD<rados_configuration_storage-devices_ceph_osd>`
|
||||
Ceph **O**\bject **S**\torage **D**\aemon. The Ceph OSD
|
||||
software, which interacts with logical disks (:term:`OSD`).
|
||||
Around 2013, there was an attempt by "research and industry"
|
||||
(Sage's own words) to insist on using the term "OSD" to mean
|
||||
only "Object Storage Device", but the Ceph community has always
|
||||
persisted in using the term to mean "Object Storage Daemon" and
|
||||
no less an authority than Sage Weil himself confirms in
|
||||
November of 2022 that "Daemon is more accurate for how Ceph is
|
||||
built" (private correspondence between Zac Dover and Sage Weil,
|
||||
07 Nov 2022).
|
||||
|
||||
Ceph OSD Daemon
|
||||
See :term:`Ceph OSD`.
|
||||
|
||||
Ceph OSD Daemons
|
||||
See :term:`Ceph OSD`.
|
||||
|
||||
Ceph Platform
|
||||
All Ceph software, which includes any piece of code hosted at
|
||||
`https://github.com/ceph`_.
|
||||
|
||||
Ceph System
|
||||
Ceph Point Release
|
||||
See :term:`Releases`.
|
||||
|
||||
Ceph Project
|
||||
The aggregate term for the people, software, mission and
|
||||
infrastructure of Ceph.
|
||||
|
||||
Ceph Release
|
||||
See :term:`Releases`.
|
||||
|
||||
Ceph Release Candidate
|
||||
See :term:`Releases`.
|
||||
|
||||
Ceph Stable Release
|
||||
See :term:`Releases`.
|
||||
|
||||
Ceph Stack
|
||||
A collection of two or more components of Ceph.
|
||||
|
||||
Ceph Node
|
||||
Node
|
||||
Host
|
||||
Any single machine or server in a Ceph System.
|
||||
:ref:`Ceph Storage Cluster<arch-ceph-storage-cluster>`
|
||||
The collection of :term:`Ceph Monitor`\s, :term:`Ceph
|
||||
Manager`\s, :term:`Ceph Metadata Server`\s, and :term:`OSD`\s
|
||||
that work together to store and replicate data for use by
|
||||
applications, Ceph Users, and :term:`Ceph Client`\s. Ceph
|
||||
Storage Clusters receive data from :term:`Ceph Client`\s.
|
||||
|
||||
Ceph Storage Cluster
|
||||
Ceph Object Store
|
||||
RADOS
|
||||
RADOS Cluster
|
||||
Reliable Autonomic Distributed Object Store
|
||||
The core set of storage software which stores the user's data (MON+OSD).
|
||||
cephx
|
||||
The Ceph authentication protocol. Cephx operates like Kerberos,
|
||||
but it has no single point of failure.
|
||||
|
||||
Ceph Cluster Map
|
||||
Cluster Map
|
||||
The set of maps comprising the monitor map, OSD map, PG map, MDS map and
|
||||
CRUSH map. See `Cluster Map`_ for details.
|
||||
|
||||
Ceph Object Storage
|
||||
The object storage "product", service or capabilities, which consists
|
||||
essentially of a Ceph Storage Cluster and a Ceph Object Gateway.
|
||||
|
||||
Ceph Object Gateway
|
||||
RADOS Gateway
|
||||
RGW
|
||||
The S3/Swift gateway component of Ceph.
|
||||
|
||||
Ceph Block Device
|
||||
RBD
|
||||
The block storage component of Ceph.
|
||||
|
||||
Ceph Block Storage
|
||||
The block storage "product," service or capabilities when used in
|
||||
conjunction with ``librbd``, a hypervisor such as QEMU or Xen, and a
|
||||
hypervisor abstraction layer such as ``libvirt``.
|
||||
|
||||
Ceph File System
|
||||
CephFS
|
||||
Ceph FS
|
||||
The POSIX filesystem components of Ceph. Refer
|
||||
:ref:`CephFS Architecture <arch-cephfs>` and :ref:`ceph-file-system` for
|
||||
more details.
|
||||
Client
|
||||
A client is any program external to Ceph that uses a Ceph
|
||||
Cluster to store and replicate data.
|
||||
|
||||
Cloud Platforms
|
||||
Cloud Stacks
|
||||
Third party cloud provisioning platforms such as OpenStack, CloudStack,
|
||||
OpenNebula, Proxmox VE, etc.
|
||||
Third party cloud provisioning platforms such as OpenStack,
|
||||
CloudStack, OpenNebula, and Proxmox VE.
|
||||
|
||||
Object Storage Device
|
||||
OSD
|
||||
A physical or logical storage unit (*e.g.*, LUN).
|
||||
Sometimes, Ceph users use the
|
||||
term "OSD" to refer to :term:`Ceph OSD Daemon`, though the
|
||||
proper term is "Ceph OSD".
|
||||
Cluster Map
|
||||
The set of maps consisting of the monitor map, OSD map, PG map,
|
||||
MDS map, and CRUSH map, which together report the state of the
|
||||
Ceph cluster. See :ref:`the "Cluster Map" section of the
|
||||
Architecture document<architecture_cluster_map>` for details.
|
||||
|
||||
Ceph OSD Daemon
|
||||
Ceph OSD Daemons
|
||||
Ceph OSD
|
||||
The Ceph OSD software, which interacts with a logical
|
||||
disk (:term:`OSD`). Sometimes, Ceph users use the
|
||||
term "OSD" to refer to "Ceph OSD Daemon", though the
|
||||
proper term is "Ceph OSD".
|
||||
CRUSH
|
||||
Controlled Replication Under Scalable Hashing. It is the
|
||||
algorithm Ceph uses to compute object storage locations.
|
||||
|
||||
OSD id
|
||||
The integer that defines an OSD. It is generated by the monitors as part
|
||||
of the creation of a new OSD.
|
||||
CRUSH rule
|
||||
The CRUSH data placement rule that applies to a particular
|
||||
pool(s).
|
||||
|
||||
OSD fsid
|
||||
This is a unique identifier used to further improve the uniqueness of an
|
||||
OSD and it is found in the OSD path in a file called ``osd_fsid``. This
|
||||
``fsid`` term is used interchangeably with ``uuid``
|
||||
DAS
|
||||
**D**\irect-\ **A**\ttached **S**\torage. Storage that is
|
||||
attached directly to the computer accessing it, without passing
|
||||
through a network. Contrast with NAS and SAN.
|
||||
|
||||
OSD uuid
|
||||
Just like the OSD fsid, this is the OSD unique identifier and is used
|
||||
interchangeably with ``fsid``
|
||||
:ref:`Dashboard<mgr-dashboard>`
|
||||
A built-in web-based Ceph management and monitoring application
|
||||
to administer various aspects and objects of the cluster. The
|
||||
dashboard is implemented as a Ceph Manager module. See
|
||||
:ref:`mgr-dashboard` for more details.
|
||||
|
||||
bluestore
|
||||
OSD BlueStore is a new back end for OSD daemons (kraken and newer
|
||||
versions). Unlike :term:`filestore` it stores objects directly on the
|
||||
Ceph block devices without any file system interface.
|
||||
Dashboard Module
|
||||
Another name for :term:`Dashboard`.
|
||||
|
||||
Dashboard Plugin
|
||||
filestore
|
||||
A back end for OSD daemons, where a Journal is needed and files are
|
||||
written to the filesystem.
|
||||
A back end for OSD daemons, where a Journal is needed and files
|
||||
are written to the filesystem.
|
||||
|
||||
FQDN
|
||||
**F**\ully **Q**\ualified **D**\omain **N**\ame. A domain name
|
||||
that is applied to a node in a network and that specifies the
|
||||
node's exact location in the tree hierarchy of the DNS.
|
||||
|
||||
In the context of Ceph cluster administration, FQDNs are often
|
||||
applied to hosts. In this documentation, the term "FQDN" is
|
||||
used mostly to distinguish between FQDNs and relatively simpler
|
||||
hostnames, which do not specify the exact location of the host
|
||||
in the tree hierarchy of the DNS but merely name the host.
|
||||
|
||||
Host
|
||||
Any single machine or server in a Ceph Cluster. See :term:`Ceph
|
||||
Node`.
|
||||
|
||||
LVM tags
|
||||
Extensible metadata for LVM volumes and groups. It is used to
|
||||
store Ceph-specific information about devices and their
|
||||
relationship with OSDs.
|
||||
|
||||
:ref:`MDS<cephfs_add_remote_mds>`
|
||||
The Ceph **M**\eta\ **D**\ata **S**\erver daemon. Also referred
|
||||
to as "ceph-mds". The Ceph metadata server daemon must be
|
||||
running in any Ceph cluster that runs the CephFS file system.
|
||||
The MDS stores all filesystem metadata.
|
||||
|
||||
MGR
|
||||
The Ceph manager software, which collects all the state from
|
||||
the whole cluster in one place.
|
||||
|
||||
Ceph Monitor
|
||||
MON
|
||||
The Ceph monitor software.
|
||||
|
||||
Ceph Manager
|
||||
MGR
|
||||
The Ceph manager software, which collects all the state from the whole
|
||||
cluster in one place.
|
||||
Node
|
||||
See :term:`Ceph Node`.
|
||||
|
||||
Ceph Manager Dashboard
|
||||
Ceph Dashboard
|
||||
Dashboard Module
|
||||
Dashboard Plugin
|
||||
Dashboard
|
||||
A built-in web-based Ceph management and monitoring application to
|
||||
administer various aspects and objects of the cluster. The dashboard is
|
||||
implemented as a Ceph Manager module. See :ref:`mgr-dashboard` for more
|
||||
details.
|
||||
Object Storage Device
|
||||
See :term:`OSD`.
|
||||
|
||||
Ceph Metadata Server
|
||||
MDS
|
||||
The Ceph metadata software.
|
||||
OSD
|
||||
Probably :term:`Ceph OSD`, but not necessarily. Sometimes
|
||||
(especially in older correspondence, and especially in
|
||||
documentation that is not written specifically for Ceph), "OSD"
|
||||
means "**O**\bject **S**\torage **D**\evice", which refers to a
|
||||
physical or logical storage unit (for example: LUN). The Ceph
|
||||
community has always used the term "OSD" to refer to
|
||||
:term:`Ceph OSD Daemon` despite an industry push in the
|
||||
mid-2010s to insist that "OSD" should refer to "Object Storage
|
||||
Device", so it is important to know which meaning is intended.
|
||||
|
||||
Ceph Clients
|
||||
Ceph Client
|
||||
The collection of Ceph components which can access a Ceph Storage
|
||||
Cluster. These include the Ceph Object Gateway, the Ceph Block Device,
|
||||
the Ceph File System, and their corresponding libraries, kernel modules,
|
||||
and FUSEs.
|
||||
OSD fsid
|
||||
This is a unique identifier used to identify an OSD. It is
|
||||
found in the OSD path in a file called ``osd_fsid``. The
|
||||
term ``fsid`` is used interchangeably with ``uuid``
|
||||
|
||||
Ceph Kernel Modules
|
||||
The collection of kernel modules which can be used to interact with the
|
||||
Ceph System (e.g., ``ceph.ko``, ``rbd.ko``).
|
||||
OSD id
|
||||
The integer that defines an OSD. It is generated by the
|
||||
monitors during the creation of each OSD.
|
||||
|
||||
Ceph Client Libraries
|
||||
The collection of libraries that can be used to interact with components
|
||||
of the Ceph System.
|
||||
OSD uuid
|
||||
This is the unique identifier of an OSD. This term is used
|
||||
interchangeably with ``fsid``
|
||||
|
||||
Period
|
||||
In the context of :term:`RGW`, a period is the configuration
|
||||
state of the :term:`Realm`. The period stores the configuration
|
||||
state of a multi-site configuration. When the period is updated,
|
||||
the "epoch" is said thereby to have been changed.
|
||||
|
||||
:ref:`Pool<rados_pools>`
|
||||
A pool is a logical partition used to store objects.
|
||||
|
||||
Pools
|
||||
See :term:`pool`.
|
||||
|
||||
RADOS
|
||||
**R**\eliable **A**\utonomic **D**\istributed **O**\bject
|
||||
**S**\tore. RADOS is the object store that provides a scalable
|
||||
service for variably-sized objects. The RADOS object store is
|
||||
the core component of a Ceph cluster. `This blog post from
|
||||
2009
|
||||
<https://ceph.io/en/news/blog/2009/the-rados-distributed-object-store/>`_
|
||||
provides a beginner's introduction to RADOS. Readers interested
|
||||
in a deeper understanding of RADOS are directed to `RADOS: A
|
||||
Scalable, Reliable Storage Service for Petabyte-scale Storage
|
||||
Clusters <https://ceph.io/assets/pdfs/weil-rados-pdsw07.pdf>`_.
|
||||
|
||||
RADOS Cluster
|
||||
A proper subset of the Ceph Cluster consisting of
|
||||
:term:`OSD`\s, :term:`Ceph Monitor`\s, and :term:`Ceph
|
||||
Manager`\s.
|
||||
|
||||
RADOS Gateway
|
||||
See :term:`RGW`.
|
||||
|
||||
RBD
|
||||
The block storage component of Ceph. Also called "RADOS Block
|
||||
Device" or :term:`Ceph Block Device`.
|
||||
|
||||
:ref:`Realm<rgw-realms>`
|
||||
In the context of RADOS Gateway (RGW), a realm is a globally
|
||||
unique namespace that consists of one or more zonegroups.
|
||||
|
||||
Releases
|
||||
|
||||
Ceph Interim Release
|
||||
A version of Ceph that has not yet been put through
|
||||
quality assurance testing. May contain new features.
|
||||
|
||||
Ceph Point Release
|
||||
Any ad hoc release that includes only bug fixes and
|
||||
security fixes.
|
||||
|
||||
Ceph Release
|
||||
Any distinct numbered version of Ceph.
|
||||
|
||||
Ceph Point Release
|
||||
Any ad-hoc release that includes only bug or security fixes.
|
||||
|
||||
Ceph Interim Release
|
||||
Versions of Ceph that have not yet been put through quality assurance
|
||||
testing, but may contain new features.
|
||||
|
||||
Ceph Release Candidate
|
||||
A major version of Ceph that has undergone initial quality assurance
|
||||
testing and is ready for beta testers.
|
||||
A major version of Ceph that has undergone initial
|
||||
quality assurance testing and is ready for beta
|
||||
testers.
|
||||
|
||||
Ceph Stable Release
|
||||
A major version of Ceph where all features from the preceding interim
|
||||
releases have been put through quality assurance testing successfully.
|
||||
A major version of Ceph where all features from the
|
||||
preceding interim releases have been put through
|
||||
quality assurance testing successfully.
|
||||
|
||||
Reliable Autonomic Distributed Object Store
|
||||
The core set of storage software which stores the user's data
|
||||
(MON+OSD). See also :term:`RADOS`.
|
||||
|
||||
:ref:`RGW<object-gateway>`
|
||||
**R**\ADOS **G**\ate **W**\ay.
|
||||
|
||||
The component of Ceph that provides a gateway to both the
|
||||
Amazon S3 RESTful API and the OpenStack Swift API. Also called
|
||||
"RADOS Gateway" and "Ceph Object Gateway".
|
||||
|
||||
secrets
|
||||
Secrets are credentials used to perform digital authentication
|
||||
whenever privileged users must access systems that require
|
||||
authentication. Secrets can be passwords, API keys, tokens, SSH
|
||||
keys, private certificates, or encryption keys.
|
||||
|
||||
SDS
|
||||
Software-defined storage.
|
||||
|
||||
systemd oneshot
|
||||
A systemd ``type`` where a command is defined in ``ExecStart``
|
||||
which will exit upon completion (it is not intended to
|
||||
daemonize)
|
||||
|
||||
Ceph Test Framework
|
||||
Teuthology
|
||||
The collection of software that performs scripted tests on Ceph.
|
||||
|
||||
CRUSH
|
||||
Controlled Replication Under Scalable Hashing. It is the algorithm
|
||||
Ceph uses to compute object storage locations.
|
||||
|
||||
CRUSH rule
|
||||
The CRUSH data placement rule that applies to a particular pool(s).
|
||||
|
||||
Pool
|
||||
Pools
|
||||
Pools are logical partitions for storing objects.
|
||||
|
||||
systemd oneshot
|
||||
A systemd ``type`` where a command is defined in ``ExecStart`` which will
|
||||
exit upon completion (it is not intended to daemonize)
|
||||
|
||||
LVM tags
|
||||
Extensible metadata for LVM volumes and groups. It is used to store
|
||||
Ceph-specific information about devices and its relationship with
|
||||
OSDs.
|
||||
Zone
|
||||
In the context of :term:`RGW`, a zone is a logical group that
|
||||
consists of one or more :term:`RGW` instances. A zone's
|
||||
configuration state is stored in the :term:`period`. See
|
||||
:ref:`Zones<radosgw-zones>`.
|
||||
|
||||
.. _https://github.com/ceph: https://github.com/ceph
|
||||
.. _Cluster Map: ../architecture#cluster-map
|
||||
|
File diff suppressed because it is too large
Before Width: | Height: | Size: 568 KiB After Width: | Height: | Size: 730 KiB |
@ -2,8 +2,7 @@
|
||||
Welcome to Ceph
|
||||
=================
|
||||
|
||||
Ceph uniquely delivers **object, block, and file storage in one unified
|
||||
system**.
|
||||
Ceph delivers **object, block, and file storage in one unified system**.
|
||||
|
||||
.. warning::
|
||||
|
||||
@ -112,8 +111,8 @@ about Ceph, see our `Architecture`_ section.
|
||||
governance
|
||||
foundation
|
||||
ceph-volume/index
|
||||
releases/general
|
||||
releases/index
|
||||
Ceph Releases (general) <https://docs.ceph.com/en/latest/releases/general/>
|
||||
Ceph Releases (index) <https://docs.ceph.com/en/latest/releases/>
|
||||
security/index
|
||||
Glossary <glossary>
|
||||
Tracing <jaegertracing/index>
|
||||
|
@ -2,33 +2,37 @@
|
||||
Cloning the Ceph Source Code Repository
|
||||
=========================================
|
||||
|
||||
You may clone a Ceph branch of the Ceph source code by going to `github Ceph
|
||||
Repository`_, selecting a branch (``master`` by default), and clicking the
|
||||
**Download ZIP** button.
|
||||
To clone a Ceph branch of the Ceph source code, go to `github Ceph
|
||||
Repository`_, select a branch (``main`` by default), and click the **Download
|
||||
ZIP** button.
|
||||
|
||||
.. _github Ceph Repository: https://github.com/ceph/ceph
|
||||
|
||||
To clone the entire git repository, :ref:`install <install-git>` and configure
|
||||
``git``.
|
||||
|
||||
To clone the entire git repository, install and configure ``git``.
|
||||
|
||||
.. _install-git:
|
||||
|
||||
Install Git
|
||||
===========
|
||||
|
||||
To install ``git`` on Debian/Ubuntu, execute::
|
||||
To install ``git`` on Debian/Ubuntu, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo apt-get install git
|
||||
|
||||
|
||||
To install ``git`` on CentOS/RHEL, execute::
|
||||
To install ``git`` on CentOS/RHEL, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo yum install git
|
||||
|
||||
|
||||
You must also have a ``github`` account. If you do not have a
|
||||
``github`` account, go to `github.com`_ and register.
|
||||
Follow the directions for setting up git at
|
||||
`Set Up Git`_.
|
||||
You must have a ``github`` account. If you do not have a ``github``
|
||||
account, go to `github.com`_ and register. Follow the directions for setting
|
||||
up git at `Set Up Git`_.
|
||||
|
||||
.. _github.com: https://github.com
|
||||
.. _Set Up Git: https://help.github.com/linux-set-up-git
|
||||
@ -37,26 +41,31 @@ Follow the directions for setting up git at
|
||||
Add SSH Keys (Optional)
|
||||
=======================
|
||||
|
||||
If you intend to commit code to Ceph or to clone using SSH
|
||||
To commit code to Ceph or to clone the respository by using SSH
|
||||
(``git@github.com:ceph/ceph.git``), you must generate SSH keys for github.
|
||||
|
||||
.. tip:: If you only intend to clone the repository, you may
|
||||
.. tip:: If you want only to clone the repository, you can
|
||||
use ``git clone --recursive https://github.com/ceph/ceph.git``
|
||||
without generating SSH keys.
|
||||
|
||||
To generate SSH keys for ``github``, execute::
|
||||
To generate SSH keys for ``github``, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh-keygen
|
||||
|
||||
Get the key to add to your ``github`` account (the following example
|
||||
assumes you used the default file path)::
|
||||
To print the SSH key that you just generated and that you will add to your
|
||||
``github`` account, use the ``cat`` command. (The following example assumes you
|
||||
used the default file path.):
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
cat .ssh/id_rsa.pub
|
||||
|
||||
Copy the public key.
|
||||
|
||||
Go to your ``github`` account, click on "Account Settings" (i.e., the
|
||||
'tools' icon); then, click "SSH Keys" on the left side navbar.
|
||||
Go to your ``github`` account, click "Account Settings" (represented by the
|
||||
'tools' icon), and click "SSH Keys" on the left side navbar.
|
||||
|
||||
Click "Add SSH key" in the "SSH Keys" list, enter a name for the key, paste the
|
||||
key you generated, and press the "Add key" button.
|
||||
@ -65,37 +74,122 @@ key you generated, and press the "Add key" button.
|
||||
Clone the Source
|
||||
================
|
||||
|
||||
To clone the Ceph source code repository, execute::
|
||||
To clone the Ceph source code repository, run the following command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git clone --recursive https://github.com/ceph/ceph.git
|
||||
|
||||
Once ``git clone`` executes, you should have a full copy of the Ceph
|
||||
After ``git clone`` has run, you should have a full copy of the Ceph
|
||||
repository.
|
||||
|
||||
.. tip:: Make sure you maintain the latest copies of the submodules
|
||||
included in the repository. Running ``git status`` will tell you if
|
||||
the submodules are out of date.
|
||||
.. tip:: Make sure you maintain the latest copies of the submodules included in
|
||||
the repository. Running ``git status`` will tell you whether the submodules
|
||||
are out of date. See :ref:`update-submodules` for more information.
|
||||
|
||||
::
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
cd ceph
|
||||
git status
|
||||
|
||||
If your submodules are out of date, run::
|
||||
.. _update-submodules:
|
||||
|
||||
Updating Submodules
|
||||
-------------------
|
||||
|
||||
#. Determine whether your submodules are out of date:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git status
|
||||
|
||||
A. If your submodules are up to date
|
||||
If your submodules are up to date, the following console output will
|
||||
appear:
|
||||
|
||||
::
|
||||
|
||||
On branch main
|
||||
Your branch is up to date with 'origin/main'.
|
||||
|
||||
nothing to commit, working tree clean
|
||||
|
||||
If you see this console output, then your submodules are up to date.
|
||||
You do not need this procedure.
|
||||
|
||||
|
||||
B. If your submodules are not up to date
|
||||
If your submodules are not up to date, you will see a message that
includes a list of "untracked files". The example below was generated
from a real situation in which the submodules were no longer current.
Your list of files will differ, but if any untracked files are listed,
continue to the next step of this procedure.
|
||||
|
||||
::
|
||||
|
||||
On branch main
|
||||
Your branch is up to date with 'origin/main'.
|
||||
|
||||
Untracked files:
|
||||
(use "git add <file>..." to include in what will be committed)
|
||||
src/pybind/cephfs/build/
|
||||
src/pybind/cephfs/cephfs.c
|
||||
src/pybind/cephfs/cephfs.egg-info/
|
||||
src/pybind/rados/build/
|
||||
src/pybind/rados/rados.c
|
||||
src/pybind/rados/rados.egg-info/
|
||||
src/pybind/rbd/build/
|
||||
src/pybind/rbd/rbd.c
|
||||
src/pybind/rbd/rbd.egg-info/
|
||||
src/pybind/rgw/build/
|
||||
src/pybind/rgw/rgw.c
|
||||
src/pybind/rgw/rgw.egg-info/
|
||||
|
||||
nothing added to commit but untracked files present (use "git add" to track)
|
||||
|
||||
#. If your submodules are out of date, run the following commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git submodule update --force --init --recursive
|
||||
git clean -fdx
|
||||
git submodule foreach git clean -fdx
|
||||
|
||||
If you still have problems with a submodule directory, use ``rm -rf
|
||||
[directory name]`` to remove the directory. Then run ``git submodule update
|
||||
--init --recursive`` again.
|
||||
|
||||
#. Run ``git status`` again:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git status
|
||||
|
||||
Your submodules are up to date if you see the following message:
|
||||
|
||||
::
|
||||
|
||||
On branch main
|
||||
Your branch is up to date with 'origin/main'.
|
||||
|
||||
nothing to commit, working tree clean
|
||||
|
||||
Choose a Branch
|
||||
===============
|
||||
|
||||
Once you clone the source code and submodules, your Ceph repository
|
||||
will be on the ``master`` branch by default, which is the unstable
|
||||
will be on the ``main`` branch by default, which is the unstable
|
||||
development branch. You may choose other branches too.
|
||||
|
||||
- ``master``: The unstable development branch.
|
||||
- ``stable``: The bugfix branch.
|
||||
- ``main``: The unstable development branch.
|
||||
- ``stable-release-name``: The name of a stable release listed under `Active Releases`_, e.g. ``Pacific``
|
||||
- ``next``: The release candidate branch.
|
||||
|
||||
::
|
||||
|
||||
git checkout master
|
||||
git checkout main
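To work from a stable release branch instead, check out that branch by name.
Here ``quincy`` is used only as an example of an active release name;
substitute the release you need::

    git checkout quincy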
|
||||
|
||||
.. _Active Releases: https://docs.ceph.com/en/latest/releases/#active-releases
|
||||
|
@ -19,7 +19,7 @@ Ceph Container Images
|
||||
Official Releases
|
||||
-----------------
|
||||
|
||||
Ceph Container images are available from both Quay and Docker Hub::
|
||||
Ceph Container images are available from Quay:
|
||||
|
||||
https://quay.io/repository/ceph/ceph
|
||||
https://hub.docker.com/r/ceph
|
||||
|
@ -120,7 +120,7 @@ For RPMs::
|
||||
|
||||
https://download.ceph.com/rpm-{version}
|
||||
|
||||
The major releases of Ceph are summarized at: :ref:`ceph-releases-general`
|
||||
The major releases of Ceph are summarized at: `Releases`_
|
||||
|
||||
.. tip:: For non-US users: There might be a mirror close to you where
|
||||
to download Ceph from. For more information see: `Ceph Mirrors`_.
|
||||
@ -376,6 +376,7 @@ line to get the short codename.
|
||||
|
||||
|
||||
|
||||
.. _Releases: https://docs.ceph.com/en/latest/releases/
|
||||
.. _the testing Debian repository: https://download.ceph.com/debian-testing/dists
|
||||
.. _the shaman page: https://shaman.ceph.com
|
||||
.. _Ceph Mirrors: ../mirrors
|
||||
|
@ -4,20 +4,19 @@
|
||||
Installing Ceph
|
||||
===============
|
||||
|
||||
There are several different ways to install Ceph. Choose the
|
||||
method that best suits your needs.
|
||||
There are multiple ways to install Ceph.
|
||||
|
||||
Recommended methods
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
:ref:`Cephadm <cephadm>` installs and manages a Ceph cluster using containers and
|
||||
systemd, with tight integration with the CLI and dashboard GUI.
|
||||
:ref:`Cephadm <cephadm_deploying_new_cluster>` installs and manages a Ceph
|
||||
cluster that uses containers and systemd and is tightly integrated with the CLI
|
||||
and dashboard GUI.
|
||||
|
||||
* cephadm only supports Octopus and newer releases.
|
||||
* cephadm is fully integrated with the new orchestration API and
|
||||
fully supports the new CLI and dashboard features to manage
|
||||
cluster deployment.
|
||||
* cephadm requires container support (podman or docker) and
|
||||
* cephadm supports only Octopus and newer releases.
|
||||
* cephadm is fully integrated with the orchestration API and fully supports the
|
||||
CLI and dashboard features that are used to manage cluster deployment.
|
||||
* cephadm requires container support (in the form of Podman or Docker) and
|
||||
Python 3.
|
||||
|
||||
`Rook <https://rook.io/>`_ deploys and manages Ceph clusters running
|
||||
@ -25,12 +24,12 @@ in Kubernetes, while also enabling management of storage resources and
|
||||
provisioning via Kubernetes APIs. We recommend Rook as the way to run Ceph in
|
||||
Kubernetes or to connect an existing Ceph storage cluster to Kubernetes.
|
||||
|
||||
* Rook only supports Nautilus and newer releases of Ceph.
|
||||
* Rook supports only Nautilus and newer releases of Ceph.
|
||||
* Rook is the preferred method for running Ceph on Kubernetes, or for
|
||||
connecting a Kubernetes cluster to an existing (external) Ceph
|
||||
cluster.
|
||||
* Rook supports the new orchestrator API. New management features
|
||||
in the CLI and dashboard are fully supported.
|
||||
* Rook supports the orchestrator API. Management features in the CLI and
|
||||
dashboard are fully supported.
|
||||
|
||||
Other methods
|
||||
~~~~~~~~~~~~~
|
||||
@ -39,16 +38,20 @@ Other methods
|
||||
Ceph clusters using Ansible.
|
||||
|
||||
* ceph-ansible is widely deployed.
|
||||
* ceph-ansible is not integrated with the new orchestrator APIs,
|
||||
introduced in Nautlius and Octopus, which means that newer
|
||||
management features and dashboard integration are not available.
|
||||
* ceph-ansible is not integrated with the orchestrator APIs that were
|
||||
introduced in Nautilus and Octopus, which means that the management features
|
||||
and dashboard integration introduced in Nautilus and Octopus are not
|
||||
available in Ceph clusters deployed by means of ceph-ansible.
|
||||
|
||||
|
||||
`ceph-deploy <https://docs.ceph.com/projects/ceph-deploy/en/latest/>`_ is a tool for quickly deploying clusters.
|
||||
`ceph-deploy <https://docs.ceph.com/projects/ceph-deploy/en/latest/>`_ is a
|
||||
tool that can be used to quickly deploy clusters. It is deprecated.
|
||||
|
||||
.. IMPORTANT::
|
||||
|
||||
ceph-deploy is no longer actively maintained. It is not tested on versions of Ceph newer than Nautilus. It does not support RHEL8, CentOS 8, or newer operating systems.
|
||||
ceph-deploy is not actively maintained. It is not tested on versions of Ceph
|
||||
newer than Nautilus. It does not support RHEL8, CentOS 8, or newer operating
|
||||
systems.
|
||||
|
||||
`ceph-salt <https://github.com/ceph/ceph-salt>`_ installs Ceph using Salt and cephadm.
|
||||
|
||||
@ -67,7 +70,7 @@ Ceph can also be :ref:`installed manually <install-manual>`.
|
||||
Windows
|
||||
~~~~~~~
|
||||
|
||||
For Windows installations, please consult this document:
|
||||
For Windows installations, consult this document:
|
||||
`Windows installation guide`_.
|
||||
|
||||
.. _Windows installation guide: ./windows-install
|
||||
|
@ -1,5 +1,7 @@
|
||||
:orphan:
|
||||
|
||||
.. _ceph_osd-daemon:
|
||||
|
||||
========================================
|
||||
ceph-osd -- ceph object storage daemon
|
||||
========================================
|
||||
|
@ -16,15 +16,10 @@ Synopsis
|
||||
Description
|
||||
===========
|
||||
|
||||
**ceph-rbdnamer** prints the pool and image name for the given RBD devices
|
||||
to stdout. It is used by `udev` (using a rule like the one below) to
|
||||
set up a device symlink.
|
||||
|
||||
|
||||
::
|
||||
|
||||
KERNEL=="rbd[0-9]*", PROGRAM="/usr/bin/ceph-rbdnamer %n", SYMLINK+="rbd/%c{1}/%c{2}"
|
||||
|
||||
**ceph-rbdnamer** prints the pool, namespace, image and snapshot names
|
||||
for a given RBD device to stdout. It is used by `udev` device manager
|
||||
to set up RBD device symlinks. The appropriate `udev` rules are
|
||||
provided in a file named `50-rbd.rules`.
|
||||
|
||||
Availability
|
||||
============
|
||||
|
@ -108,6 +108,16 @@ pools; it only runs simulations by mapping values in the range
|
||||
shows that value **24** is mapped to devices **[11,6]** by rule
|
||||
**1**.
|
||||
|
||||
One of the following is required when using the ``--show-mappings`` option:
|
||||
|
||||
(a) ``--num-rep``
|
||||
(b) both ``--min-rep`` and ``--max-rep``
|
||||
|
||||
``--num-rep`` stands for "number of replicas". It indicates the number of
replicas in a pool and is used to specify an exact number of replicas (for
example, ``--num-rep 5``). ``--min-rep`` and ``--max-rep`` are used together
to specify a range of replicas (for example, ``--min-rep 1 --max-rep 10``).
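For example, a test run that displays the mappings produced by rule 1 with
three replicas for inputs 0 through 9 might look like the following (the
compiled map file name ``crushmap.bin`` is only illustrative)::

    crushtool -i crushmap.bin --test --rule 1 --num-rep 3 --min-x 0 --max-x 9 --show-mappings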
|
||||
|
||||
.. option:: --show-bad-mappings
|
||||
|
||||
Displays which value failed to be mapped to the required number of
|
||||
|
@ -22,6 +22,9 @@ the real work. To mount a Ceph file system use::
|
||||
|
||||
mount.ceph name@07fe3187-00d9-42a3-814b-72a4d5e7d5be.fs_name=/ /mnt/mycephfs -o mon_addr=1.2.3.4
|
||||
|
||||
where "name" is the RADOS client name (referred to hereafter as "RADOS user",
|
||||
and meaning any individual or system actor such as an application).
|
||||
|
||||
The mount helper can fill in the cluster FSID by reading the Ceph configuration file.
It is recommended to call the mount helper via mount(8), as follows::
|
||||
|
||||
@ -113,6 +116,12 @@ Basic
|
||||
them. If an inode contains any stale file locks, read/write on the inode
|
||||
is not allowed until applications release all stale file locks.
|
||||
|
||||
:command:`fs=<fs-name>`
Specify the non-default file system to be mounted when using the old syntax.

:command:`mds_namespace=<fs-name>`
A synonym of "fs=" (deprecated).
|
||||
|
||||
Advanced
|
||||
--------
|
||||
:command:`cap_release_safety`
|
||||
@ -226,6 +235,10 @@ If authentication is disabled on Ceph cluster, omit the credential related optio
|
||||
|
||||
mount.ceph fs_user@.mycephfs2=/ /mnt/mycephfs
|
||||
|
||||
To mount using the old syntax::
|
||||
|
||||
mount -t ceph 192.168.0.1:/ /mnt/mycephfs
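To mount a non-default file system with the old syntax, the ``fs`` option
described above can be supplied; the user and file system names below are
only examples::

    mount -t ceph 192.168.0.1:/ /mnt/mycephfs -o name=fs_user,fs=mycephfs2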
|
||||
|
||||
Availability
|
||||
============
|
||||
|
||||
|
@ -743,6 +743,10 @@ Options
|
||||
|
||||
The placement target index type (normal, indexless, or #id).
|
||||
|
||||
.. option:: --placement-inline-data=<true>
|
||||
|
||||
Whether the placement target is configured to store a data chunk inline in head objects.
|
||||
|
||||
.. option:: --tier-type=<type>
|
||||
|
||||
The zone tier type.
|
||||
|
@ -9,7 +9,7 @@
|
||||
Synopsis
|
||||
========
|
||||
|
||||
| **rbd-nbd** [-c conf] [--read-only] [--device *nbd device*] [--nbds_max *limit*] [--max_part *limit*] [--exclusive] [--notrim] [--encryption-format *format*] [--encryption-passphrase-file *passphrase-file*] [--io-timeout *seconds*] [--reattach-timeout *seconds*] map *image-spec* | *snap-spec*
|
||||
| **rbd-nbd** [-c conf] [--read-only] [--device *nbd device*] [--snap-id *snap-id*] [--nbds_max *limit*] [--max_part *limit*] [--exclusive] [--notrim] [--encryption-format *format*] [--encryption-passphrase-file *passphrase-file*] [--io-timeout *seconds*] [--reattach-timeout *seconds*] map *image-spec* | *snap-spec*
|
||||
| **rbd-nbd** unmap *nbd device* | *image-spec* | *snap-spec*
|
||||
| **rbd-nbd** list-mapped
|
||||
| **rbd-nbd** attach --device *nbd device* *image-spec* | *snap-spec*
|
||||
@ -71,6 +71,10 @@ Options
|
||||
attached after the old process is detached. The default is 30
|
||||
second.
|
||||
|
||||
.. option:: --snap-id *snapid*
|
||||
|
||||
Specify a snapshot to map/unmap/attach/detach by ID instead of by name.
|
||||
|
||||
Image and snap specs
|
||||
====================
|
||||
|
||||
|
@ -258,7 +258,7 @@ Commands
|
||||
Show the rbd images that are mapped via the rbd kernel module
|
||||
(default) or other supported device.
|
||||
|
||||
:command:`device map` [-t | --device-type *device-type*] [--cookie *device-cookie*] [--show-cookie] [--read-only] [--exclusive] [-o | --options *device-options*] *image-spec* | *snap-spec*
|
||||
:command:`device map` [-t | --device-type *device-type*] [--cookie *device-cookie*] [--show-cookie] [--snap-id *snap-id*] [--read-only] [--exclusive] [-o | --options *device-options*] *image-spec* | *snap-spec*
|
||||
Map the specified image to a block device via the rbd kernel module
|
||||
(default) or other supported device (*nbd* on Linux or *ggate* on
|
||||
FreeBSD).
|
||||
@ -266,14 +266,14 @@ Commands
|
||||
The --options argument is a comma separated list of device type
|
||||
specific options (opt1,opt2=val,...).
|
||||
|
||||
:command:`device unmap` [-t | --device-type *device-type*] [-o | --options *device-options*] *image-spec* | *snap-spec* | *device-path*
|
||||
:command:`device unmap` [-t | --device-type *device-type*] [-o | --options *device-options*] [--snap-id *snap-id*] *image-spec* | *snap-spec* | *device-path*
|
||||
Unmap the block device that was mapped via the rbd kernel module
|
||||
(default) or other supported device.
|
||||
|
||||
The --options argument is a comma separated list of device type
|
||||
specific options (opt1,opt2=val,...).
|
||||
|
||||
:command:`device attach` [-t | --device-type *device-type*] --device *device-path* [--cookie *device-cookie*] [--show-cookie] [--read-only] [--exclusive] [--force] [-o | --options *device-options*] *image-spec* | *snap-spec*
|
||||
:command:`device attach` [-t | --device-type *device-type*] --device *device-path* [--cookie *device-cookie*] [--show-cookie] [--snap-id *snap-id*] [--read-only] [--exclusive] [--force] [-o | --options *device-options*] *image-spec* | *snap-spec*
|
||||
Attach the specified image to the specified block device (currently only
|
||||
`nbd` on Linux). This operation is unsafe and should not be normally used.
|
||||
In particular, specifying the wrong image or the wrong block device may
|
||||
@ -282,7 +282,7 @@ Commands
|
||||
The --options argument is a comma separated list of device type
|
||||
specific options (opt1,opt2=val,...).
|
||||
|
||||
:command:`device detach` [-t | --device-type *device-type*] [-o | --options *device-options*] *image-spec* | *snap-spec* | *device-path*
|
||||
:command:`device detach` [-t | --device-type *device-type*] [-o | --options *device-options*] [--snap-id *snap-id*] *image-spec* | *snap-spec* | *device-path*
|
||||
Detach the block device that was mapped or attached (currently only `nbd`
|
||||
on Linux). This operation is unsafe and should not be normally used.
|
||||
|
||||
|
@ -561,8 +561,10 @@ on appropriate hosts, proceed with the following steps.
|
||||
services run on a manager host will be restarted automatically on a different
|
||||
manager host if one Ceph Manager goes down.
|
||||
|
||||
#. Add Prometheus as data source to Grafana `using the Grafana Web UI
|
||||
<https://grafana.com/docs/grafana/latest/features/datasources/add-a-data-source/>`_.
|
||||
#. Add Prometheus as data source to Grafana `using the Grafana Web UI <https://grafana.com/docs/grafana/latest/features/datasources/add-a-data-source/>`_.
|
||||
|
||||
.. IMPORTANT::
|
||||
The data source must be named "Dashboard1".
|
||||
|
||||
#. Install the `vonage-status-panel and grafana-piechart-panel` plugins using:
|
||||
|
||||
@ -1233,6 +1235,23 @@ code of standby dashboards. To do so you need to run the command:
|
||||
|
||||
ceph config set mgr mgr/dashboard/standby_error_status_code 503
|
||||
|
||||
Resolve IP address to hostname before redirect
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The redirect from a standby to the active dashboard is done via the IP
address. This is done because resolving IP addresses to hostnames can be
error-prone in containerized environments, which is also the reason why this
option is disabled by default. However, in some situations it might be helpful
to redirect via the hostname: for example, if the configured TLS certificate
matches only the hostnames. To activate redirection via the hostname, run the
following command::
|
||||
|
||||
$ ceph config set mgr mgr/dashboard/redirect_resolve_ip_addr True
|
||||
|
||||
You can disable it again by running::
|
||||
|
||||
$ ceph config set mgr mgr/dashboard/redirect_resolve_ip_addr False
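If you are unsure which behavior is currently active, the same configuration
key can be queried as a quick check::

    $ ceph config get mgr mgr/dashboard/redirect_resolve_ip_addr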
|
||||
|
||||
HAProxy example configuration
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
@ -108,9 +108,13 @@ following commands::
|
||||
Exposing commands
|
||||
-----------------
|
||||
|
||||
There are two approaches for exposing a command. The first one is to
|
||||
use the ``@CLICommand`` decorator to decorate the method which handles
|
||||
the command. like this
|
||||
There are two approaches for exposing a command. The first method involves using
|
||||
the ``@CLICommand`` decorator to decorate the methods needed to handle a command.
|
||||
The second method uses a ``COMMANDS`` attribute defined for the module class.
|
||||
|
||||
|
||||
The CLICommand approach
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. code:: python
|
||||
|
||||
@ -131,7 +135,7 @@ the command. like this
|
||||
else:
|
||||
location = blackhole
|
||||
self.send_object_to(obj, location)
|
||||
return HandleCommandResult(stdout=f'the black hole swallowed '{oid}'")
|
||||
return HandleCommandResult(stdout=f"the black hole swallowed '{oid}'")
|
||||
|
||||
The first parameter passed to ``CLICommand`` is the "name" of the command.
|
||||
Since there are lots of commands in Ceph, we tend to group related commands
|
||||
@ -164,7 +168,11 @@ In addition to ``@CLICommand``, you could also use ``@CLIReadCommand`` or
|
||||
``@CLIWriteCommand`` if your command only requires read permissions or
|
||||
write permissions respectively.
|
||||
|
||||
The second one is to set the ``COMMANDS`` class attribute of your module to
|
||||
|
||||
The COMMANDS Approach
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
This method uses the ``COMMANDS`` class attribute of your module to define
|
||||
a list of dicts like this::
|
||||
|
||||
COMMANDS = [
|
||||
@ -197,6 +205,192 @@ when they are sent:
|
||||
.. py:currentmodule:: mgr_module
|
||||
.. automethod:: MgrModule.handle_command
|
||||
|
||||
|
||||
Responses and Formatting
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Functions that handle manager commands are expected to return a three-element
|
||||
tuple with the type signature ``Tuple[int, str, str]``. The first element is a
|
||||
return value/error code, where zero indicates no error and a negative `errno`_
|
||||
is typically used for error conditions. The second element corresponds to the
|
||||
command's "output". The third element corresponds to the command's "error
|
||||
output" (akin to stderr) and is frequently used to report textual error details
|
||||
when the return code is non-zero. The ``mgr_module.HandleCommandResult`` type
|
||||
can also be used in lieu of a response tuple.
|
||||
|
||||
.. _`errno`: https://man7.org/linux/man-pages/man3/errno.3.html
|
||||
|
||||
When the implementation of a command raises an exception one of two possible
|
||||
approaches to handling the exception exist. First, the command function can do
|
||||
nothing and let the exception bubble up to the manager. When this happens the
|
||||
manager will automatically set a return code to -EINVAL and record a trace-back
|
||||
in the error output. This trace-back can be very long in some cases. The second
|
||||
approach is to handle an exception within a try-except block and convert the
|
||||
exception to an error code that better fits the exception (converting a
|
||||
KeyError to -ENOENT, for example). In this case the error output may also be
|
||||
set to something more specific and actionable by the one calling the command.
|
||||
|
||||
In many cases, especially in more recent versions of Ceph, manager commands are
|
||||
designed to return structured output to the caller. Structured output includes
|
||||
machine-parsable data such as JSON, YAML, XML, etc. JSON is the most common
|
||||
structured output format returned by manager commands. As of Ceph Reef, there
|
||||
are a number of new decorators available from the ``object_format`` module that
|
||||
help manage formatting output and handling exceptions automatically. The
|
||||
intent is that most of the implementation of a manager command can be written in
|
||||
an idiomatic (aka "Pythonic") style and the decorators will take care of most of
|
||||
the work needed to format the output and return manager response tuples.
|
||||
|
||||
In most cases, net new code should use the ``Responder`` decorator. Example:
|
||||
|
||||
.. code:: python
|
||||
|
||||
@CLICommand('antigravity list wormholes', perm='r')
|
||||
@Responder()
|
||||
def list_wormholes(self, oid: str, details: bool = False) -> List[Dict[str, Any]]:
|
||||
'''List wormholes associated with the supplied oid.
|
||||
'''
|
||||
with self.open_wormhole_db() as db:
|
||||
wormholes = db.query(oid=oid)
|
||||
if not details:
|
||||
return [{'name': wh.name} for wh in wormholes]
|
||||
return [{'name': wh.name, 'age': wh.get_age(), 'destination': wh.dest}
|
||||
for wh in wormholes]
|
||||
|
||||
Formatting
|
||||
++++++++++
|
||||
|
||||
The ``Responder`` decorator automatically takes care of converting Python
|
||||
objects into a response tuple with formatted output. By default, this decorator
|
||||
can automatically return JSON and YAML. When invoked from the command line the
|
||||
``--format`` flag can be used to select the response format. If left
|
||||
unspecified, JSON will be returned. The automatic formatting can be applied to
|
||||
any basic Python type: lists, dicts, str, int, etc. Other objects can be
|
||||
formatted automatically if they meet the ``SimpleDataProvider`` protocol - they
|
||||
provide a ``to_simplified`` method. The ``to_simplified`` function must return
|
||||
a simplified representation of the object made out of basic types.
|
||||
|
||||
.. code:: python
|
||||
|
||||
class MyCleverObject:
|
||||
def to_simplified(self) -> Dict[str, int]:
|
||||
# returns a python object(s) made up from basic types
|
||||
return {"gravitons": 999, "tachyons": 404}
|
||||
|
||||
@CLICommand('antigravity list wormholes', perm='r')
|
||||
@Responder()
|
||||
def list_wormholes(self, oid: str, details: bool = False) -> MyCleverObject:
|
||||
'''List wormholes associated with the supplied oid.
|
||||
'''
|
||||
...
|
||||
|
||||
The behavior of the automatic output formatting can be customized and extended
to other types of formatting (XML, plain text, etc.). As this is a complex
|
||||
topic, please refer to the module documentation for the ``object_format``
|
||||
module.
|
||||
|
||||
|
||||
|
||||
Error Handling
|
||||
++++++++++++++
|
||||
|
||||
Additionally, the ``Responder`` decorator can automatically handle converting
|
||||
some exceptions into response tuples. Any raised exception inheriting from
|
||||
``ErrorResponseBase`` will be automatically converted into a response tuple.
|
||||
The common approach is to use ``ErrorResponse``, an exception type that can
be raised directly, with arguments for the error output and return value, or
constructed from an existing exception using the ``wrap`` classmethod. The
``wrap`` classmethod automatically uses the exception's text and, if
available, its ``errno`` property.
|
||||
|
||||
Converting our previous example to use this exception handling approach:
|
||||
|
||||
.. code:: python
|
||||
|
||||
@CLICommand('antigravity list wormholes', perm='r')
|
||||
@Responder()
|
||||
def list_wormholes(self, oid: str, details: bool = False) -> List[Dict[str, Any]]:
|
||||
'''List wormholes associated with the supplied oid.
|
||||
'''
|
||||
try:
|
||||
with self.open_wormhole_db() as db:
|
||||
wormholes = db.query(oid=oid)
|
||||
except UnknownOIDError:
|
||||
raise ErrorResponse(f"Unknown oid: {oid}", return_value=-errno.ENOENT)
|
||||
except WormholeDBError as err:
|
||||
raise ErrorResponse.wrap(err)
|
||||
if not details:
|
||||
return [{'name': wh.name} for wh in wormholes]
|
||||
return [{'name': wh.name, 'age': wh.get_age(), 'destination': wh.dest}
|
||||
for wh in wormholes]
|
||||
|
||||
|
||||
.. note:: Because the decorator cannot determine the difference between a
programming mistake and an expected error condition, it does not try to
catch all exceptions.
|
||||
|
||||
|
||||
|
||||
Additional Decorators
|
||||
+++++++++++++++++++++
|
||||
|
||||
The ``object_format`` module provides additional decorators to complement
|
||||
``Responder`` but for cases where ``Responder`` is insufficient or too "heavy
|
||||
weight".
|
||||
|
||||
The ``ErrorResponseHandler`` decorator exists for cases where you *must* still
|
||||
return a manager response tuple but want to handle errors as exceptions (as in
|
||||
typical Python code). In short, it works like ``Responder`` but only with
|
||||
regard to exceptions. Just like ``Responder``, it handles exceptions that
|
||||
inherit from ``ErrorResponseBase``. This can be useful in cases where you need
|
||||
to return raw data in the output. Example:
|
||||
|
||||
.. code:: python
|
||||
|
||||
@CLICommand('antigravity dump config', perm='r')
|
||||
@ErrorResponseHandler()
|
||||
def dump_config(self, oid: str) -> Tuple[int, str, str]:
|
||||
'''Dump configuration
|
||||
'''
|
||||
# we have no control over what data is inside the blob!
|
||||
try:
|
||||
blob = self.fetch_raw_config_blob(oid)
|
||||
return 0, blob, ''
|
||||
except KeyError:
|
||||
raise ErrorResponse("Blob does not exist", return_value=-errno.ENOENT)
|
||||
|
||||
|
||||
The ``EmptyResponder`` decorator exists for cases where, on a success
|
||||
condition, no output should be generated at all. If you used ``Responder`` and
|
||||
default JSON formatting you may always see outputs like ``{}`` or ``[]`` if the
|
||||
command completes without error. Instead, ``EmptyResponder`` helps you create
|
||||
manager commands that obey the `Rule of Silence`_ when the command has no
|
||||
interesting output to emit on success. The functions that ``EmptyResponder``
|
||||
decorate should always return ``None``. Like both ``Responder`` and
|
||||
``ErrorResponseHandler`` exceptions that inhert from ``ErrorResponseBase`` will
|
||||
be automatically processed. Example:
|
||||
|
||||
.. code:: python
|
||||
|
||||
@CLICommand('antigravity create wormhole', perm='rw')
|
||||
@EmptyResponder()
|
||||
def create_wormhole(self, oid: str, name: str) -> None:
|
||||
'''Create a new wormhole.
|
||||
'''
|
||||
try:
|
||||
with self.open_wormhole_db() as db:
|
||||
wh = Wormhole(name)
|
||||
db.insert(oid=oid, wormhole=wh)
|
||||
except UnknownOIDError:
|
||||
raise ErrorResponse(f"Unknown oid: {oid}", return_value=-errno.ENOENT)
|
||||
except InvalidWormholeError as err:
|
||||
raise ErrorResponse.wrap(err)
|
||||
except WormholeDBError as err:
|
||||
raise ErrorResponse.wrap(err)
|
||||
|
||||
|
||||
.. _`Rule of Silence`: http://www.linfo.org/rule_of_silence.html
|
||||
|
||||
|
||||
Configuration options
|
||||
---------------------
|
||||
|
||||
@ -314,6 +508,7 @@ function. This will result in a circular locking exception.
|
||||
.. automethod:: MgrModule.get_perf_schema
|
||||
.. automethod:: MgrModule.get_counter
|
||||
.. automethod:: MgrModule.get_mgr_id
|
||||
.. automethod:: MgrModule.get_daemon_health_metrics
|
||||
|
||||
Exposing health checks
|
||||
----------------------
|
||||
|
@ -239,7 +239,7 @@ Create CephFS Export
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>]
|
||||
$ ceph nfs export create cephfs --cluster-id <cluster_id> --pseudo-path <pseudo_path> --fsname <fsname> [--readonly] [--path=/path/in/cephfs] [--client_addr <value>...] [--squash <value>] [--sectype <value>...]
|
||||
|
||||
This creates export RADOS objects containing the export block, where
|
||||
|
||||
@ -266,6 +266,18 @@ for permissible values.
|
||||
value is `no_root_squash`. See the `NFS-Ganesha Export Sample`_ for
|
||||
permissible values.
|
||||
|
||||
``<sectype>`` specifies which authentication methods will be used when
|
||||
connecting to the export. Valid values include "krb5p", "krb5i", "krb5", "sys",
|
||||
and "none". More than one value can be supplied. The flag may be specified
|
||||
multiple times (example: ``--sectype=krb5p --sectype=krb5i``) or multiple
|
||||
values may be separated by a comma (example: ``--sectype krb5p,krb5i``). The
|
||||
server will negotiate a supported security type with the client, preferring
the supplied methods left-to-right.
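For example, a CephFS export that prefers Kerberos privacy but still accepts
``sys`` authentication could be created as follows; the cluster ID,
pseudo-path, and file system name here are only placeholders:

.. code:: bash

    $ ceph nfs export create cephfs --cluster-id mynfs --pseudo-path /cephfs --fsname myfs --sectype krb5p,sys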
|
||||
|
||||
.. note:: Specifying values for sectype that require Kerberos will only function on servers
|
||||
that are configured to support Kerberos. Setting up NFS-Ganesha to support Kerberos
|
||||
is outside the scope of this document.
|
||||
|
||||
.. note:: Export creation is supported only for NFS Ganesha clusters deployed using nfs interface.
|
||||
|
||||
Create RGW Export
|
||||
@ -285,7 +297,7 @@ To export a *bucket*:
|
||||
|
||||
.. code::
|
||||
|
||||
$ ceph nfs export create rgw --cluster-id <cluster_id> --pseudo-path <pseudo_path> --bucket <bucket_name> [--user-id <user-id>] [--readonly] [--client_addr <value>...] [--squash <value>]
|
||||
$ ceph nfs export create rgw --cluster-id <cluster_id> --pseudo-path <pseudo_path> --bucket <bucket_name> [--user-id <user-id>] [--readonly] [--client_addr <value>...] [--squash <value>] [--sectype <value>...]
|
||||
|
||||
For example, to export *mybucket* via NFS cluster *mynfs* at the pseudo-path */bucketdata* to any host in the ``192.168.10.0/24`` network
|
||||
|
||||
@ -316,6 +328,18 @@ for permissible values.
|
||||
value is `no_root_squash`. See the `NFS-Ganesha Export Sample`_ for
|
||||
permissible values.
|
||||
|
||||
``<sectype>`` specifies which authentication methods will be used when
|
||||
connecting to the export. Valid values include "krb5p", "krb5i", "krb5", "sys",
|
||||
and "none". More than one value can be supplied. The flag may be specified
|
||||
multiple times (example: ``--sectype=krb5p --sectype=krb5i``) or multiple
|
||||
values may be separated by a comma (example: ``--sectype krb5p,krb5i``). The
|
||||
server will negotiate a supported security type with the client, preferring
the supplied methods left-to-right.
|
||||
|
||||
.. note:: Specifying values for sectype that require Kerberos will only function on servers
|
||||
that are configured to support Kerberos. Setting up NFS-Ganesha to support Kerberos
|
||||
is outside the scope of this document.
|
||||
|
||||
RGW user export
|
||||
^^^^^^^^^^^^^^^
|
||||
|
||||
|
@ -184,17 +184,18 @@ List all collections with::
|
||||
ceph telemetry collection ls
|
||||
|
||||
NAME STATUS DESC
|
||||
basic_base REPORTING Basic information about the cluster (capacity, number and type of daemons, version, etc.)
|
||||
basic_base NOT REPORTING: NOT OPTED-IN Basic information about the cluster (capacity, number and type of daemons, version, etc.)
|
||||
basic_mds_metadata NOT REPORTING: NOT OPTED-IN MDS metadata
|
||||
basic_pool_options_bluestore NOT REPORTING: NOT OPTED-IN Per-pool bluestore config options
|
||||
basic_pool_usage NOT REPORTING: NOT OPTED-IN Default pool application and usage statistics
|
||||
basic_rook_v01 NOT REPORTING: NOT OPTED-IN Basic Rook deployment data
|
||||
basic_usage_by_class NOT REPORTING: NOT OPTED-IN Default device class usage statistics
|
||||
crash_base REPORTING Information about daemon crashes (daemon type and version, backtrace, etc.)
|
||||
device_base REPORTING Information about device health metrics
|
||||
ident_base NOT REPORTING: CHANNEL ident IS OFF User-provided identifying information about the cluster
|
||||
crash_base NOT REPORTING: NOT OPTED-IN Information about daemon crashes (daemon type and version, backtrace, etc.)
|
||||
device_base NOT REPORTING: NOT OPTED-IN Information about device health metrics
|
||||
ident_base NOT REPORTING: NOT OPTED-IN, CHANNEL ident IS OFF User-provided identifying information about the cluster
|
||||
perf_memory_metrics NOT REPORTING: NOT OPTED-IN, CHANNEL perf IS OFF Heap stats and mempools for mon and mds
|
||||
perf_perf NOT REPORTING: NOT OPTED-IN, CHANNEL perf IS OFF Information about performance counters of the cluster
|
||||
|
||||
|
||||
Where:
|
||||
|
||||
**NAME**: Collection name; prefix indicates the channel the collection belongs to.
|
||||
|
@ -6,7 +6,7 @@
|
||||
|
||||
The :term:`Ceph Storage Cluster` has a messaging layer protocol that enables
|
||||
clients to interact with a :term:`Ceph Monitor` and a :term:`Ceph OSD Daemon`.
|
||||
``librados`` provides this functionality to :term:`Ceph Clients` in the form of
|
||||
``librados`` provides this functionality to :term:`Ceph Client`\s in the form of
|
||||
a library. All Ceph Clients either use ``librados`` or the same functionality
|
||||
encapsulated in ``librados`` to interact with the object store. For example,
|
||||
``librbd`` and ``libcephfs`` leverage this functionality. You may use
|
||||
|
@ -426,6 +426,22 @@ the asynchronous writes as well as an asynchronous update to the size of the
|
||||
striped file.
|
||||
|
||||
|
||||
Debugging
|
||||
^^^^^^^^^
|
||||
|
||||
Debugging libcephsqlite can be turned on via::
|
||||
|
||||
debug_cephsqlite
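For example, one way to raise this level for clients through the central
configuration database (the level ``20`` is just an illustrative choice) is::

    ceph config set client debug_cephsqlite 20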
|
||||
|
||||
If running the ``sqlite3`` command-line tool, use:
|
||||
|
||||
.. code:: sh
|
||||
|
||||
env CEPH_ARGS='--log_to_file true --log-file sqlite3.log --debug_cephsqlite 20 --debug_ms 1' sqlite3 ...
|
||||
|
||||
This will save all the usual Ceph debugging to a file ``sqlite3.log`` for inspection.
|
||||
|
||||
|
||||
.. _SQLite: https://sqlite.org/index.html
|
||||
.. _SQLite VFS: https://www.sqlite.org/vfs.html
|
||||
.. _SQLite Backup: https://www.sqlite.org/backup.html
|
||||
|
@ -43,17 +43,23 @@ Getting librados for C/C++
|
||||
--------------------------
|
||||
|
||||
To install ``librados`` development support files for C/C++ on Debian/Ubuntu
|
||||
distributions, execute the following::
|
||||
distributions, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo apt-get install librados-dev
|
||||
|
||||
To install ``librados`` development support files for C/C++ on RHEL/CentOS
|
||||
distributions, execute the following::
|
||||
distributions, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo yum install librados2-devel
|
||||
|
||||
Once you install ``librados`` for developers, you can find the required
|
||||
headers for C/C++ under ``/usr/include/rados``. ::
|
||||
headers for C/C++ under ``/usr/include/rados``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ls /usr/include/rados
|
||||
|
||||
@ -68,15 +74,26 @@ and the ``librados2-devel`` package for RHEL/CentOS will install the
|
||||
directly too.
|
||||
|
||||
To install ``librados`` development support files for Python on Debian/Ubuntu
|
||||
distributions, execute the following::
|
||||
distributions, execute the following:
|
||||
|
||||
sudo apt-get install python-rados
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo apt-get install python3-rados
|
||||
|
||||
To install ``librados`` development support files for Python on RHEL/CentOS
|
||||
distributions, execute the following::
|
||||
distributions, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo yum install python-rados
|
||||
|
||||
To install ``librados`` development support files for Python on SLE/openSUSE
|
||||
distributions, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo zypper install python3-rados
|
||||
|
||||
You can find the module under ``/usr/share/pyshared`` on Debian systems,
|
||||
or under ``/usr/lib/python*/site-packages`` on CentOS/RHEL systems.
|
||||
|
||||
@ -86,21 +103,29 @@ Getting librados for Java
|
||||
|
||||
To install ``librados`` for Java, you need to execute the following procedure:
|
||||
|
||||
#. Install ``jna.jar``. For Debian/Ubuntu, execute::
|
||||
#. Install ``jna.jar``. For Debian/Ubuntu, execute:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo apt-get install libjna-java
|
||||
|
||||
For CentOS/RHEL, execute::
|
||||
For CentOS/RHEL, execute:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo yum install jna
|
||||
|
||||
The JAR files are located in ``/usr/share/java``.
|
||||
|
||||
#. Clone the ``rados-java`` repository::
|
||||
#. Clone the ``rados-java`` repository:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git clone --recursive https://github.com/ceph/rados-java.git
|
||||
|
||||
#. Build the ``rados-java`` repository::
|
||||
#. Build the ``rados-java`` repository:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
cd rados-java
|
||||
ant
|
||||
@ -108,13 +133,17 @@ To install ``librados`` for Java, you need to execute the following procedure:
|
||||
The JAR file is located under ``rados-java/target``.
|
||||
|
||||
#. Copy the JAR for RADOS to a common location (e.g., ``/usr/share/java``) and
|
||||
ensure that it and the JNA JAR are in your JVM's classpath. For example::
|
||||
ensure that it and the JNA JAR are in your JVM's classpath. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo cp target/rados-0.1.3.jar /usr/share/java/rados-0.1.3.jar
|
||||
sudo ln -s /usr/share/java/jna-3.2.7.jar /usr/lib/jvm/default-java/jre/lib/ext/jna-3.2.7.jar
|
||||
sudo ln -s /usr/share/java/rados-0.1.3.jar /usr/lib/jvm/default-java/jre/lib/ext/rados-0.1.3.jar
|
||||
|
||||
To build the documentation, execute the following::
|
||||
To build the documentation, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ant docs
|
||||
|
||||
@ -124,19 +153,27 @@ Getting librados for PHP
|
||||
|
||||
To install the ``librados`` extension for PHP, you need to execute the following procedure:
|
||||
|
||||
#. Install php-dev. For Debian/Ubuntu, execute::
|
||||
#. Install php-dev. For Debian/Ubuntu, execute:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo apt-get install php5-dev build-essential
|
||||
|
||||
For CentOS/RHEL, execute::
|
||||
For CentOS/RHEL, execute:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo yum install php-devel
|
||||
|
||||
#. Clone the ``phprados`` repository::
|
||||
#. Clone the ``phprados`` repository:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
git clone https://github.com/ceph/phprados.git
|
||||
|
||||
#. Build ``phprados``::
|
||||
#. Build ``phprados``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
cd phprados
|
||||
phpize
|
||||
@ -144,7 +181,7 @@ To install the ``librados`` extension for PHP, you need to execute the following
|
||||
make
|
||||
sudo make install
|
||||
|
||||
#. Enable ``phprados`` in php.ini by adding::
|
||||
#. Enable ``phprados`` by adding the following line to ``php.ini``::
|
||||
|
||||
extension=rados.so
|
||||
|
||||
@ -321,7 +358,9 @@ it and connecting to the cluster might look something like this:
|
||||
|
||||
}
|
||||
|
||||
Compile your client and link to ``librados`` using ``-lrados``. For example::
|
||||
Compile your client and link to ``librados`` using ``-lrados``. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
gcc ceph-client.c -lrados -o ceph-client
|
||||
|
||||
@ -399,7 +438,9 @@ you to initialize a ``librados::Rados`` cluster handle object:
|
||||
|
||||
|
||||
Compile the source; then, link ``librados`` using ``-lrados``.
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
g++ -g -c ceph-client.cc -o ceph-client.o
|
||||
g++ -g ceph-client.o -lrados -o ceph-client
|
||||
@ -436,7 +477,9 @@ into exceptions.
|
||||
print("Connected to the cluster.")
|
||||
|
||||
|
||||
Execute the example to verify that it connects to your cluster. ::
|
||||
Execute the example to verify that it connects to your cluster:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
python ceph-client.py
|
||||
|
||||
@ -478,7 +521,9 @@ binding converts C++-based errors into exceptions.
|
||||
|
||||
Compile the source; then, run it. If you have copied the JAR to
|
||||
``/usr/share/java`` and sym linked from your ``ext`` directory, you won't need
|
||||
to specify the classpath. For example::
|
||||
to specify the classpath. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
javac CephClient.java
|
||||
java CephClient
|
||||
@ -502,7 +547,9 @@ With the RADOS extension enabled in PHP you can start creating a new cluster han
|
||||
}
|
||||
|
||||
|
||||
Save this as rados.php and run the code::
|
||||
Save this as rados.php and run the code:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
php rados.php
|
||||
|
||||
|
@ -68,7 +68,7 @@ Your Python client also requires a client keyring. For this example, we use the
|
||||
``client.admin`` key by default. If you would like to specify the keyring when
|
||||
creating the cluster handle, you may use the ``conf`` argument. Alternatively,
|
||||
you may specify the keyring path in your Ceph configuration file. For example,
|
||||
you may add something like the following line to you Ceph configuration file::
|
||||
you may add something like the following line to your Ceph configuration file::
|
||||
|
||||
keyring = /path/to/ceph.client.admin.keyring
|
||||
|
||||
|
@ -166,7 +166,9 @@ specify a ``keyring`` entry in your Ceph configuration file.
|
||||
We recommend copying the Ceph Storage Cluster's keyring file to nodes where you
|
||||
will run administrative commands, because it contains the ``client.admin`` key.
|
||||
|
||||
To perform this step manually, execute the following::
|
||||
To perform this step manually, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo scp {user}@{ceph-cluster-host}:/etc/ceph/ceph.client.admin.keyring /etc/ceph/ceph.client.admin.keyring
|
||||
|
||||
|
@ -42,11 +42,15 @@ it will fit). This means that if a DB device is specified but an explicit
|
||||
WAL device is not, the WAL will be implicitly colocated with the DB on the faster
|
||||
device.
|
||||
|
||||
A single-device (colocated) BlueStore OSD can be provisioned with::
|
||||
A single-device (colocated) BlueStore OSD can be provisioned with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm prepare --bluestore --data <device>
|
||||
|
||||
To specify a WAL device and/or DB device, ::
|
||||
To specify a WAL device and/or DB device:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm prepare --bluestore --data <device> --block.wal <wal-device> --block.db <db-device>
|
||||
|
||||
@ -66,13 +70,17 @@ the deployment strategy:
|
||||
If all devices are the same type, for example all rotational drives, and
|
||||
there are no fast devices to use for metadata, it makes sense to specify the
|
||||
block device only and to not separate ``block.db`` or ``block.wal``. The
|
||||
:ref:`ceph-volume-lvm` command for a single ``/dev/sda`` device looks like::
|
||||
:ref:`ceph-volume-lvm` command for a single ``/dev/sda`` device looks like:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm create --bluestore --data /dev/sda
|
||||
|
||||
If logical volumes have already been created for each device, (a single LV
|
||||
using 100% of the device), then the :ref:`ceph-volume-lvm` call for an LV named
|
||||
``ceph-vg/block-lv`` would look like::
|
||||
``ceph-vg/block-lv`` would look like:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm create --bluestore --data ceph-vg/block-lv
|
||||
|
||||
@ -88,35 +96,43 @@ You must create these volume groups and logical volumes manually as
|
||||
the ``ceph-volume`` tool is currently not able to do so automatically.
|
||||
|
||||
For the example below, let us assume four rotational drives (``sda``, ``sdb``, ``sdc``, and ``sdd``)
|
||||
and one (fast) solid state drive (``sdx``). First create the volume groups::
|
||||
and one (fast) solid state drive (``sdx``). First create the volume groups:
|
||||
|
||||
$ vgcreate ceph-block-0 /dev/sda
|
||||
$ vgcreate ceph-block-1 /dev/sdb
|
||||
$ vgcreate ceph-block-2 /dev/sdc
|
||||
$ vgcreate ceph-block-3 /dev/sdd
|
||||
.. prompt:: bash $
|
||||
|
||||
Now create the logical volumes for ``block``::
|
||||
vgcreate ceph-block-0 /dev/sda
|
||||
vgcreate ceph-block-1 /dev/sdb
|
||||
vgcreate ceph-block-2 /dev/sdc
|
||||
vgcreate ceph-block-3 /dev/sdd
|
||||
|
||||
$ lvcreate -l 100%FREE -n block-0 ceph-block-0
|
||||
$ lvcreate -l 100%FREE -n block-1 ceph-block-1
|
||||
$ lvcreate -l 100%FREE -n block-2 ceph-block-2
|
||||
$ lvcreate -l 100%FREE -n block-3 ceph-block-3
|
||||
Now create the logical volumes for ``block``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
lvcreate -l 100%FREE -n block-0 ceph-block-0
|
||||
lvcreate -l 100%FREE -n block-1 ceph-block-1
|
||||
lvcreate -l 100%FREE -n block-2 ceph-block-2
|
||||
lvcreate -l 100%FREE -n block-3 ceph-block-3
|
||||
|
||||
We are creating 4 OSDs for the four slow spinning devices, so assuming a 200GB
|
||||
SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB::
|
||||
SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB:
|
||||
|
||||
$ vgcreate ceph-db-0 /dev/sdx
|
||||
$ lvcreate -L 50GB -n db-0 ceph-db-0
|
||||
$ lvcreate -L 50GB -n db-1 ceph-db-0
|
||||
$ lvcreate -L 50GB -n db-2 ceph-db-0
|
||||
$ lvcreate -L 50GB -n db-3 ceph-db-0
|
||||
.. prompt:: bash $
|
||||
|
||||
Finally, create the 4 OSDs with ``ceph-volume``::
|
||||
vgcreate ceph-db-0 /dev/sdx
|
||||
lvcreate -L 50GB -n db-0 ceph-db-0
|
||||
lvcreate -L 50GB -n db-1 ceph-db-0
|
||||
lvcreate -L 50GB -n db-2 ceph-db-0
|
||||
lvcreate -L 50GB -n db-3 ceph-db-0
|
||||
|
||||
$ ceph-volume lvm create --bluestore --data ceph-block-0/block-0 --block.db ceph-db-0/db-0
|
||||
$ ceph-volume lvm create --bluestore --data ceph-block-1/block-1 --block.db ceph-db-0/db-1
|
||||
$ ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
|
||||
$ ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
|
||||
Finally, create the 4 OSDs with ``ceph-volume``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm create --bluestore --data ceph-block-0/block-0 --block.db ceph-db-0/db-0
|
||||
ceph-volume lvm create --bluestore --data ceph-block-1/block-1 --block.db ceph-db-0/db-1
|
||||
ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2
|
||||
ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3
|
||||
|
||||
These operations should end up creating four OSDs, with ``block`` on the slower
|
||||
rotational drives with a 50 GB logical volume (DB) for each on the solid state
|
||||
@ -239,7 +255,9 @@ The smaller checksum values can be used by selecting `crc32c_16` or
|
||||
`crc32c_8` as the checksum algorithm.
|
||||
|
||||
The *checksum algorithm* can be set either via a per-pool
|
||||
``csum_type`` property or the global config option. For example, ::
|
||||
``csum_type`` property or the global config option. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set <pool-name> csum_type <algorithm>
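A concrete invocation, assuming a pool named ``mypool`` (any of the supported
algorithms discussed above may be substituted), might be:

.. prompt:: bash $

   ceph osd pool set mypool csum_type crc32c_16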
|
||||
|
||||
@ -275,7 +293,9 @@ must be 70% of the size of the original (or smaller).
|
||||
The *compression mode*, *compression algorithm*, *compression required
|
||||
ratio*, *min blob size*, and *max blob size* can be set either via a
|
||||
per-pool property or a global config option. Pool properties can be
|
||||
set with::
|
||||
set with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set <pool-name> compression_algorithm <algorithm>
|
||||
ceph osd pool set <pool-name> compression_mode <mode>
|
||||
@ -342,16 +362,20 @@ Refer to `SPDK document`__ for more details.
|
||||
.. __: http://www.spdk.io/doc/getting_started.html#getting_started_examples
|
||||
|
||||
SPDK offers a script to configure the device automatically. Users can run the
|
||||
script as root::
|
||||
script as root:
|
||||
|
||||
$ sudo src/spdk/scripts/setup.sh
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo src/spdk/scripts/setup.sh
|
||||
|
||||
You will need to specify the subject NVMe device's device selector with
|
||||
the "spdk:" prefix for ``bluestore_block_path``.
|
||||
|
||||
For example, you can find the device selector of an Intel PCIe SSD with::
|
||||
For example, you can find the device selector of an Intel PCIe SSD with:
|
||||
|
||||
$ lspci -mm -n -D -d 8086:0953
|
||||
.. prompt:: bash $
|
||||
|
||||
lspci -mm -n -D -d 8086:0953
|
||||
|
||||
The device selector always has the form of ``DDDD:BB:DD.FF`` or ``DDDD.BB.DD.FF``.
|
||||
|
||||
@ -377,3 +401,118 @@ settings to ensure that all IOs are issued through SPDK.::
|
||||
|
||||
Otherwise, the current implementation will populate the SPDK map files with
|
||||
kernel file system symbols and will use the kernel driver to issue DB/WAL IO.
|
||||
|
||||
Minimum Allocation Size
|
||||
========================
|
||||
|
||||
There is a configured minimum amount of storage that BlueStore will allocate on
|
||||
an OSD. In practice, this is the least amount of capacity that a RADOS object
|
||||
can consume. The value of :confval:`bluestore_min_alloc_size` is derived from the
|
||||
value of :confval:`bluestore_min_alloc_size_hdd` or :confval:`bluestore_min_alloc_size_ssd`
|
||||
depending on the OSD's ``rotational`` attribute. This means that when an OSD
|
||||
is created on an HDD, BlueStore will be initialized with the current value
|
||||
of :confval:`bluestore_min_alloc_size_hdd`, and SSD OSDs (including NVMe devices)
|
||||
with the value of :confval:`bluestore_min_alloc_size_ssd`.
|
||||
|
||||
Through the Mimic release, the default values were 64KB and 16KB for rotational
|
||||
(HDD) and non-rotational (SSD) media respectively. Octopus changed the default
|
||||
for SSD (non-rotational) media to 4KB, and Pacific changed the default for HDD
|
||||
(rotational) media to 4KB as well.
|
||||
|
||||
These changes were driven by space amplification experienced by Ceph RADOS
|
||||
GateWay (RGW) deployments that host large numbers of small files
|
||||
(S3/Swift objects).
|
||||
|
||||
For example, when an RGW client stores a 1KB S3 object, it is written to a
|
||||
single RADOS object. With the default :confval:`min_alloc_size` value, 4KB of
|
||||
underlying drive space is allocated. This means that roughly
|
||||
(4KB - 1KB) == 3KB is allocated but never used, which corresponds to 300%
|
||||
overhead or 25% efficiency. Similarly, a 5KB user object will be stored
|
||||
as one 4KB and one 1KB RADOS object, again stranding 4KB of device capacity,
|
||||
though in this case the overhead is a much smaller percentage. Think of this
|
||||
in terms of the remainder from a modulus operation. The overhead *percentage*
|
||||
thus decreases rapidly as user object size increases.
|
||||
|
||||
An easily missed additional subtlety is that this
|
||||
takes place for *each* replica. So when using the default three copies of
|
||||
data (3R), a 1KB S3 object actually consumes roughly 9KB of storage device
|
||||
capacity. If erasure coding (EC) is used instead of replication, the
|
||||
amplification may be even higher: for a ``k=4,m=2`` pool, our 1KB S3 object
|
||||
will allocate (6 * 4KB) = 24KB of device capacity.
|
||||
|
||||
When an RGW bucket pool contains many relatively large user objects, the effect
|
||||
of this phenomenon is often negligible, but should be considered for deployments
|
||||
that expect a significant fraction of relatively small objects.
|
||||
|
||||
The 4KB default value aligns well with conventional HDD and SSD devices. Some
newer coarse-IU (Indirection Unit) QLC SSDs, however, perform and wear best
when :confval:`bluestore_min_alloc_size_ssd`
is set at OSD creation to match the device's IU: 8KB, 16KB, or even 64KB.
|
||||
These novel storage drives allow one to achieve read performance competitive
|
||||
with conventional TLC SSDs and write performance faster than HDDs, with
|
||||
high density and lower cost than TLC SSDs.
|
||||
|
||||
Note that when creating OSDs on these devices, one must carefully apply the
|
||||
non-default value only to appropriate devices, and not to conventional SSD and
|
||||
HDD devices. This may be done through careful ordering of OSD creation, custom
OSD device classes, and especially by the use of central configuration *masks*,
as sketched below.
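As a sketch of the configuration-mask approach, a setting scoped to the hosts
that carry the coarse-IU drives (the host name ``qlc-host-01`` is purely
illustrative, and the value must be in place before the OSDs are created)
might look like:

.. prompt:: bash #

   ceph config set osd/host:qlc-host-01 bluestore_min_alloc_size_ssd 16384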
|
||||
|
||||
Quincy and later releases add
|
||||
the :confval:`bluestore_use_optimal_io_size_for_min_alloc_size`
|
||||
option that enables automatic discovery of the appropriate value as each OSD is
|
||||
created. Note that the use of ``bcache``, ``OpenCAS``, ``dmcrypt``,
|
||||
``ATA over Ethernet``, ``iSCSI``, or other device layering / abstraction
|
||||
technologies may confound the determination of appropriate values. OSDs
|
||||
deployed on top of VMware storage have been reported to also
|
||||
sometimes report a ``rotational`` attribute that does not match the underlying
|
||||
hardware.
|
||||
|
||||
We suggest inspecting such OSDs at startup via logs and admin sockets to ensure that
|
||||
behavior is appropriate. Note that this also may not work as desired with
|
||||
older kernels. You can check for this by examining the presence and value
|
||||
of ``/sys/block/<drive>/queue/optimal_io_size``.
|
||||
|
||||
You may also inspect a given OSD:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph osd metadata osd.1701 | grep rotational
|
||||
|
||||
This space amplification may manifest as an unusually high ratio of raw to
stored data reported by ``ceph df``. ``ceph osd df`` may also report
anomalously high ``%USE`` / ``VAR`` values when
compared to other, ostensibly identical OSDs. A pool using OSDs with
mismatched ``min_alloc_size`` values may experience unexpected balancer
behavior as well.
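A quick, non-invasive way to look for these symptoms (both commands are
read-only; what counts as an outlier depends on your workload and is not
prescribed here):

.. prompt:: bash #

   # Compare cluster-wide raw usage against logical (stored) usage per pool.
   ceph df detail
   # Look for outlier %USE / VAR values among ostensibly identical OSDs.
   ceph osd df tree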
|
||||
Note that this BlueStore attribute takes effect *only* at OSD creation; if
|
||||
changed later, a given OSD's behavior will not change unless / until it is
|
||||
destroyed and redeployed with the appropriate option value(s). Upgrading
|
||||
to a later Ceph release will *not* change the value used by OSDs deployed
|
||||
under older releases or with other settings.
|
||||
|
||||
|
||||
.. confval:: bluestore_min_alloc_size
|
||||
.. confval:: bluestore_min_alloc_size_hdd
|
||||
.. confval:: bluestore_min_alloc_size_ssd
|
||||
.. confval:: bluestore_use_optimal_io_size_for_min_alloc_size
|
||||
|
||||
DSA (Data Streaming Accelerator Usage)
|
||||
======================================
|
||||
|
||||
If you want to use the DML library to drive a DSA device for offloading
read/write operations on persistent memory in BlueStore, you need to install
the `DML`_ and `idxd-config`_ libraries on a machine with an SPR (Sapphire Rapids) CPU.
|
||||
.. _DML: https://github.com/intel/DML
|
||||
.. _idxd-config: https://github.com/intel/idxd-config
|
||||
|
||||
After installing the DML software, you need to configure the shared
work queues (WQs) using the ``accel-config`` tool, as in the following example:
|
||||
.. prompt:: bash $
|
||||
|
||||
accel-config config-wq --group-id=1 --mode=shared --wq-size=16 --threshold=15 --type=user --name="MyApp1" --priority=10 --block-on-fault=1 dsa0/wq0.1
|
||||
accel-config config-engine dsa0/engine0.1 --group-id=1
|
||||
accel-config enable-device dsa0
|
||||
accel-config enable-wq dsa0/wq0.1
|
||||
|
@ -486,17 +486,26 @@ The following CLI commands are used to configure the cluster:
|
||||
Help
|
||||
====
|
||||
|
||||
You can get help for a particular option with::
|
||||
You can get help for a particular option with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config help <option>
|
||||
|
||||
Note that this will use the configuration schema that is compiled into the running monitors. If you have a mixed-version cluster (e.g., during an upgrade), you might also want to query the option schema from a specific running daemon::
|
||||
Note that this will use the configuration schema that is compiled into the running monitors. If you have a mixed-version cluster (e.g., during an upgrade), you might also want to query the option schema from a specific running daemon:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph daemon <name> config help [option]
|
||||
|
||||
For example,::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config help log_file
|
||||
|
||||
::
|
||||
|
||||
$ ceph config help log_file
|
||||
log_file - path to log file
|
||||
(std::string, basic)
|
||||
Default (non-daemon):
|
||||
@ -504,9 +513,14 @@ For example,::
|
||||
Can update at runtime: false
|
||||
See also: [log_to_stderr,err_to_stderr,log_to_syslog,err_to_syslog]
|
||||
|
||||
or::
|
||||
or:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config help log_file -f json-pretty
|
||||
|
||||
::
|
||||
|
||||
$ ceph config help log_file -f json-pretty
|
||||
{
|
||||
"name": "log_file",
|
||||
"type": "std::string",
|
||||
@ -537,81 +551,107 @@ testing purposes, and are not recommended for use by operators.
|
||||
Runtime Changes
|
||||
===============
|
||||
|
||||
In most cases, Ceph allows you to make changes to the configuration of
|
||||
a daemon at runtime. This capability is quite useful for
|
||||
increasing/decreasing logging output, enabling/disabling debug
|
||||
settings, and even for runtime optimization.
|
||||
In most cases, Ceph permits changes to the configuration of a daemon at
|
||||
runtime. This can be used for increasing or decreasing the amount of logging
|
||||
output, for enabling or disabling debug settings, and for runtime optimization.
|
||||
|
||||
Generally speaking, configuration options can be updated in the usual
|
||||
way via the ``ceph config set`` command. For example, do enable the debug log level on a specific OSD,::
|
||||
Configuration options can be updated via the ``ceph config set`` command. For
|
||||
example, to enable the debug log level on a specific OSD, run a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set osd.123 debug_ms 20
|
||||
|
||||
Note that if the same option is also customized in a local
|
||||
configuration file, the monitor setting will be ignored (it has a
|
||||
lower priority than the local config file).
|
||||
.. note:: If an option has been customized in a local configuration file, the
|
||||
`central config
|
||||
<https://ceph.io/en/news/blog/2018/new-mimic-centralized-configuration-management/>`_
|
||||
setting will be ignored (it has a lower priority than the local
|
||||
configuration file).
|
||||
|
||||
Override values
|
||||
---------------
|
||||
|
||||
You can also temporarily set an option using the `tell` or `daemon`
|
||||
interfaces on the Ceph CLI. These *override* values are ephemeral in
|
||||
that they only affect the running process and are discarded/lost if
|
||||
the daemon or process restarts.
|
||||
Options can be set temporarily by using the `tell` or `daemon` interfaces on
|
||||
the Ceph CLI. These *override* values are ephemeral, which means that they
|
||||
affect only the current instance of the daemon and revert to persistently
|
||||
configured values when the daemon restarts.
|
||||
|
||||
Override values can be set in two ways:
|
||||
|
||||
#. From any host, we can send a message to a daemon over the network with::
|
||||
#. From any host, send a message to a daemon with a command of the following
|
||||
form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell <name> config set <option> <value>
|
||||
|
||||
For example,::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell osd.123 config set debug_osd 20
|
||||
|
||||
The `tell` command can also accept a wildcard for the daemon
|
||||
identifier. For example, to adjust the debug level on all OSD
|
||||
daemons,::
|
||||
The ``tell`` command can also accept a wildcard as the daemon identifier.
|
||||
For example, to adjust the debug level on all OSD daemons, run a command of
|
||||
this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell osd.* config set debug_osd 20
|
||||
|
||||
#. From the host the process is running on, we can connect directly to
|
||||
the process via a socket in ``/var/run/ceph`` with::
|
||||
#. On the host where the daemon is running, connect to the daemon via a socket
|
||||
in ``/var/run/ceph`` by running a command of this form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph daemon <name> config set <option> <value>
|
||||
|
||||
For example,::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph daemon osd.4 config set debug_osd 20
|
||||
|
||||
Note that in the ``ceph config show`` command output these temporary
|
||||
values will be shown with a source of ``override``.
|
||||
.. note:: In the output of the ``ceph config show`` command, these temporary
|
||||
values are shown with a source of ``override``.
|
||||
|
||||
|
||||
Viewing runtime settings
|
||||
========================
|
||||
|
||||
You can see the current options set for a running daemon with the ``ceph config show`` command. For example,::
|
||||
You can see the current options set for a running daemon with the ``ceph config show`` command. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config show osd.0
|
||||
|
||||
will show you the (non-default) options for that daemon. You can also look at a specific option with::
|
||||
will show you the (non-default) options for that daemon. You can also look at a specific option with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config show osd.0 debug_osd
|
||||
|
||||
or view all options (even those with default values) with::
|
||||
or view all options (even those with default values) with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config show-with-defaults osd.0
|
||||
|
||||
You can also observe settings for a running daemon by connecting to it from the local host via the admin socket. For example,::
|
||||
You can also observe settings for a running daemon by connecting to it from the local host via the admin socket. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph daemon osd.0 config show
|
||||
|
||||
will dump all current settings,::
|
||||
will dump all current settings:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph daemon osd.0 config diff
|
||||
|
||||
will show only non-default settings (as well as where the value came from: a config file, the monitor, an override, etc.), and::
|
||||
will show only non-default settings (as well as where the value came from: a config file, the monitor, an override, etc.), and:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph daemon osd.0 config get debug_osd
|
||||
|
||||
|
@ -2,12 +2,13 @@
|
||||
Configuration
|
||||
===============
|
||||
|
||||
Each Ceph process, daemon, or utility draws its configuration from
|
||||
several sources on startup, include a local configuration, the
|
||||
monitors, the command line, or environment variables. Configuration
|
||||
options may be set globally such that they apply to all daemons, to
|
||||
all daemons or services of a particular type, or only to a specific
|
||||
daemon, process, or client.
|
||||
Each Ceph process, daemon, or utility draws its configuration from several
|
||||
sources on startup. Such sources can include (1) a local configuration, (2) the
|
||||
monitors, (3) the command line, and (4) environment variables.
|
||||
|
||||
Configuration options can be set globally so that they apply (1) to all
|
||||
daemons, (2) to all daemons or services of a particular type, or (3) to only a
|
||||
specific daemon, process, or client.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
|
@ -88,6 +88,14 @@ Users can choose between the following built-in profile types:
|
||||
.. note:: The values mentioned in the tables below represent the percentage
|
||||
of the total IOPS capacity of the OSD allocated for the service type.
|
||||
|
||||
By default, the *high_client_ops* profile is enabled to ensure that a larger
chunk of the bandwidth allocation goes to client ops. Background recovery ops
are given a lower allocation (and therefore take longer to complete). But
there might be instances that necessitate giving higher allocations to either
client ops or recovery ops. In order to deal with such a situation, the
alternate built-in profiles may be enabled by following the steps mentioned
in the next sections; a short example is sketched below.
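As a sketch only (the exact workflow for your cluster may differ), switching
all OSDs to the recovery-oriented profile and back again looks like this:

.. prompt:: bash #

   # Favor background recovery over client I/O for the duration of a rebuild.
   ceph config set osd osd_mclock_profile high_recovery_ops
   # Return to the default, client-oriented profile afterwards.
   ceph config set osd osd_mclock_profile high_client_ops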
|
||||
high_client_ops (*default*)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
This profile optimizes client performance over background activities by
|
||||
@ -143,10 +151,7 @@ within the OSD.
|
||||
+------------------------+-------------+--------+-------+
|
||||
|
||||
.. note:: Across the built-in profiles, internal background best-effort clients
|
||||
of mclock ("scrub", "snap trim", and "pg deletion") are given lower
|
||||
reservations but no limits(MAX). This ensures that requests from such
|
||||
clients are able to complete quickly if there are no other competing
|
||||
operations.
|
||||
of mclock include "scrub", "snap trim", and "pg deletion" operations.
|
||||
|
||||
|
||||
Custom Profile
|
||||
@ -158,9 +163,13 @@ users, who understand mclock and Ceph related configuration options.
|
||||
|
||||
.. index:: mclock; built-in profiles
|
||||
|
||||
mClock Built-in Profiles
|
||||
========================
|
||||
mClock Built-in Profiles - Locked Config Options
|
||||
=================================================
|
||||
The sections below describe the config options that are locked to certain
values in order to ensure that the mClock scheduler is able to provide
predictable QoS.
|
||||
mClock Config Options
|
||||
---------------------
|
||||
When a built-in profile is enabled, the mClock scheduler calculates the low
|
||||
level mclock parameters [*reservation*, *weight*, *limit*] based on the profile
|
||||
enabled for each client type. The mclock parameters are calculated based on
|
||||
@ -177,24 +186,40 @@ config parameters cannot be modified when using any of the built-in profiles:
|
||||
- :confval:`osd_mclock_scheduler_background_best_effort_wgt`
|
||||
- :confval:`osd_mclock_scheduler_background_best_effort_lim`
|
||||
|
||||
The following Ceph options will not be modifiable by the user:
|
||||
Recovery/Backfill Options
|
||||
-------------------------
|
||||
The following recovery and backfill related Ceph options are set to new defaults
|
||||
for mClock:
|
||||
|
||||
- :confval:`osd_max_backfills`
|
||||
- :confval:`osd_recovery_max_active`
|
||||
- :confval:`osd_recovery_max_active_hdd`
|
||||
- :confval:`osd_recovery_max_active_ssd`
|
||||
|
||||
This is because the above options are internally modified by the mclock
|
||||
scheduler in order to maximize the impact of the set profile.
|
||||
The following table shows the new mClock defaults. This is done to maximize the
|
||||
impact of the built-in profile:
|
||||
|
||||
By default, the *high_client_ops* profile is enabled to ensure that a larger
|
||||
chunk of the bandwidth allocation goes to client ops. Background recovery ops
|
||||
are given lower allocation (and therefore take a longer time to complete). But
|
||||
there might be instances that necessitate giving higher allocations to either
|
||||
client ops or recovery ops. In order to deal with such a situation, the
|
||||
alternate built-in profiles may be enabled by following the steps mentioned
|
||||
in the next section.
|
||||
+----------------------------------------+------------------+----------------+
| Config Option                          | Original Default | mClock Default |
+========================================+==================+================+
| :confval:`osd_max_backfills`           | 1                | 10             |
+----------------------------------------+------------------+----------------+
| :confval:`osd_recovery_max_active`     | 0                | 0              |
+----------------------------------------+------------------+----------------+
| :confval:`osd_recovery_max_active_hdd` | 3                | 10             |
+----------------------------------------+------------------+----------------+
| :confval:`osd_recovery_max_active_ssd` | 10               | 20             |
+----------------------------------------+------------------+----------------+

The above mClock defaults can, if necessary, be modified by enabling
:confval:`osd_mclock_override_recovery_settings` (default: false). The
steps for this are discussed in the
`Steps to Modify mClock Max Backfills/Recovery Limits`_ section.
|
||||
Sleep Options
|
||||
-------------
|
||||
If any mClock profile (including "custom") is active, the following Ceph config
|
||||
sleep options will be disabled,
|
||||
sleep options are disabled (set to 0),
|
||||
|
||||
- :confval:`osd_recovery_sleep`
|
||||
- :confval:`osd_recovery_sleep_hdd`
|
||||
@ -386,6 +411,70 @@ The individual QoS-related config options for the *custom* profile can also be
|
||||
modified ephemerally using the above commands.
|
||||
|
||||
|
||||
Steps to Modify mClock Max Backfills/Recovery Limits
|
||||
====================================================
|
||||
|
||||
This section describes the steps to modify the default max backfills or recovery
|
||||
limits if the need arises.
|
||||
|
||||
.. warning:: This section is for advanced users or for experimental testing. The
|
||||
recommendation is to retain the defaults as is on a running cluster as
|
||||
modifying them could have unexpected performance outcomes. The values may
|
||||
be modified only if the cluster is unable to cope/showing poor performance
|
||||
with the default settings or for performing experiments on a test cluster.
|
||||
|
||||
.. important:: The max backfill/recovery options that can be modified are listed
|
||||
in section `Recovery/Backfill Options`_. The modification of the mClock
|
||||
default backfills/recovery limit is gated by the
|
||||
:confval:`osd_mclock_override_recovery_settings` option, which is set to
|
||||
*false* by default. Attempting to modify any default recovery/backfill
|
||||
limits without setting the gating option will reset that option back to the
|
||||
mClock defaults along with a warning message logged in the cluster log. Note
|
||||
that it may take a few seconds for the default value to come back into
|
||||
effect. Verify the limit using the *config show* command as shown below.
|
||||
|
||||
#. Set the :confval:`osd_mclock_override_recovery_settings` config option on all
|
||||
osds to *true* using:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config set osd osd_mclock_override_recovery_settings true
|
||||
|
||||
#. Set the desired max backfill/recovery option using:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config set osd osd_max_backfills <value>
|
||||
|
||||
For example, the following command modifies the :confval:`osd_max_backfills`
|
||||
option on all osds to 5.
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config set osd osd_max_backfills 5
|
||||
|
||||
#. Wait for a few seconds and verify the running configuration for a specific
|
||||
OSD using:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config show osd.N | grep osd_max_backfills
|
||||
|
||||
For example, the following command shows the running configuration of
|
||||
:confval:`osd_max_backfills` on osd.0.
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config show osd.0 | grep osd_max_backfills
|
||||
|
||||
#. Reset the :confval:`osd_mclock_override_recovery_settings` config option on
|
||||
all osds to *false* using:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
ceph config set osd osd_mclock_override_recovery_settings false
|
||||
|
||||
|
||||
OSD Capacity Determination (Automated)
|
||||
======================================
|
||||
|
||||
@ -413,6 +502,46 @@ node whose underlying device type is SSD:
|
||||
|
||||
ceph config show osd.0 osd_mclock_max_capacity_iops_ssd
|
||||
|
||||
Mitigation of Unrealistic OSD Capacity From Automated Test
|
||||
----------------------------------------------------------
|
||||
In certain conditions, the OSD bench tool may show unrealistic or inflated
results depending on the drive configuration and other environment-related
conditions. To mitigate the performance impact of such an unrealistic capacity
measurement, two threshold config options, one per OSD device type, are
defined and used:
|
||||
- :confval:`osd_mclock_iops_capacity_threshold_hdd` = 500
|
||||
- :confval:`osd_mclock_iops_capacity_threshold_ssd` = 80000
|
||||
|
||||
The following automated step is performed:
|
||||
|
||||
Fallback to using default OSD capacity (automated)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
If OSD bench reports a measurement that exceeds the above threshold values
|
||||
depending on the underlying device type, the fallback mechanism reverts to the
|
||||
default value of :confval:`osd_mclock_max_capacity_iops_hdd` or
|
||||
:confval:`osd_mclock_max_capacity_iops_ssd`. The threshold config options
|
||||
can be reconfigured based on the type of drive used. Additionally, a cluster
|
||||
warning is logged in case the measurement exceeds the threshold. For example, ::
|
||||
|
||||
2022-10-27T15:30:23.270+0000 7f9b5dbe95c0 0 log_channel(cluster) log [WRN]
|
||||
: OSD bench result of 39546.479392 IOPS exceeded the threshold limit of
|
||||
25000.000000 IOPS for osd.1. IOPS capacity is unchanged at 21500.000000
|
||||
IOPS. The recommendation is to establish the osd's IOPS capacity using other
|
||||
benchmark tools (e.g. Fio) and then override
|
||||
osd_mclock_max_capacity_iops_[hdd|ssd].
|
||||
|
||||
If the default capacity doesn't accurately represent the OSD's capacity, the
|
||||
following additional step is recommended to address this:
|
||||
|
||||
Run custom drive benchmark if defaults are not accurate (manual)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
If the default OSD capacity is not accurate, the recommendation is to run a
|
||||
custom benchmark using your preferred tool (e.g. Fio) on the drive and then
|
||||
override the ``osd_mclock_max_capacity_iops_[hdd, ssd]`` option as described
|
||||
in the `Specifying Max OSD Capacity`_ section.
|
||||
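As an illustrative sketch only (the device path, the fio parameters, and the
13500 IOPS figure below are hypothetical; size the test to your hardware and
note that writing to the raw device destroys any data on it):

.. prompt:: bash #

   # Measure random 4KiB write IOPS directly against the raw device with fio.
   fio --name=osd-bench --filename=/dev/sdX --direct=1 --rw=randwrite \
       --bs=4k --iodepth=32 --runtime=60 --time_based --ioengine=libaio
   # Override the capacity of the corresponding OSD with the measured IOPS.
   ceph config set osd.0 osd_mclock_max_capacity_iops_ssd 13500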
|
||||
This step is highly recommended until an alternate mechanism is available.
|
||||
Steps to Manually Benchmark an OSD (Optional)
|
||||
=============================================
|
||||
@ -426,9 +555,10 @@ Steps to Manually Benchmark an OSD (Optional)
|
||||
`Specifying Max OSD Capacity`_.
|
||||
|
||||
|
||||
Any existing benchmarking tool can be used for this purpose. In this case, the
|
||||
steps use the *Ceph OSD Bench* command described in the next section. Regardless
|
||||
of the tool/command used, the steps outlined further below remain the same.
|
||||
Any existing benchmarking tool (e.g. Fio) can be used for this purpose. In this
|
||||
case, the steps use the *Ceph OSD Bench* command described in the next section.
|
||||
Regardless of the tool/command used, the steps outlined further below remain the
|
||||
same.
|
||||
|
||||
As already described in the :ref:`dmclock-qos` section, the number of
|
||||
shards and the bluestore's throttle parameters have an impact on the mclock op
|
||||
@ -551,5 +681,8 @@ mClock Config Options
|
||||
.. confval:: osd_mclock_cost_per_byte_usec_ssd
|
||||
.. confval:: osd_mclock_force_run_benchmark_on_init
|
||||
.. confval:: osd_mclock_skip_benchmark
|
||||
.. confval:: osd_mclock_override_recovery_settings
|
||||
.. confval:: osd_mclock_iops_capacity_threshold_hdd
|
||||
.. confval:: osd_mclock_iops_capacity_threshold_ssd
|
||||
|
||||
.. _the dmClock algorithm: https://www.usenix.org/legacy/event/osdi10/tech/full_papers/Gulati.pdf
|
||||
|
@ -1,3 +1,5 @@
|
||||
.. _monitor-config-reference:
|
||||
|
||||
==========================
|
||||
Monitor Config Reference
|
||||
==========================
|
||||
|
@ -188,14 +188,19 @@ By default, ``ms_bind_msgr2`` is true starting with Nautilus 14.2.z.
|
||||
However, until the monitors start using v2, only limited services will
|
||||
start advertising v2 addresses.
|
||||
|
||||
For most users, the monitors are binding to the default legacy port ``6789`` for the v1 protocol. When this is the case, enabling v2 is as simple as::
|
||||
For most users, the monitors are binding to the default legacy port ``6789``
|
||||
for the v1 protocol. When this is the case, enabling v2 is as simple as:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon enable-msgr2
|
||||
|
||||
If the monitors are bound to non-standard ports, you will need to
|
||||
specify an additional port for v2 explicitly. For example, if your
|
||||
monitor ``mon.a`` binds to ``1.2.3.4:1111``, and you want to add v2 on
|
||||
port ``1112``,::
|
||||
port ``1112``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon set-addrs a [v2:1.2.3.4:1112,v1:1.2.3.4:1111]
|
||||
|
||||
|
@ -60,6 +60,8 @@ By default, daemons `bind`_ to ports within the ``6800:7300`` range. You may
|
||||
configure this range at your discretion. Before configuring your IP tables,
|
||||
check the default ``iptables`` configuration.
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo iptables -L
|
||||
|
||||
Some Linux distributions include rules that reject all inbound requests
|
||||
@ -80,7 +82,9 @@ default. Additionally, Ceph Monitors always operate on the public
|
||||
network. When you add the rule using the example below, make sure you
|
||||
replace ``{iface}`` with the public network interface (e.g., ``eth0``,
|
||||
``eth1``, etc.), ``{ip-address}`` with the IP address of the public
|
||||
network and ``{netmask}`` with the netmask for the public network. ::
|
||||
network and ``{netmask}`` with the netmask for the public network:
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo iptables -A INPUT -i {iface} -p tcp -s {ip-address}/{netmask} --dport 6789 -j ACCEPT
|
||||
|
||||
@ -98,7 +102,9 @@ you replace ``{iface}`` with the public network interface (e.g., ``eth0``,
|
||||
``eth1``, etc.), ``{ip-address}`` with the IP address of the public network
|
||||
and ``{netmask}`` with the netmask of the public network.
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo iptables -A INPUT -i {iface} -m multiport -p tcp -s {ip-address}/{netmask} --dports 6800:7300 -j ACCEPT
|
||||
|
||||
@ -139,7 +145,9 @@ the public network and other Ceph OSD Daemons will connect using the cluster
|
||||
network. When you add the rule using the example below, make sure you replace
|
||||
``{iface}`` with the network interface (e.g., ``eth0``, ``eth1``, etc.),
|
||||
``{ip-address}`` with the IP address and ``{netmask}`` with the netmask of the
|
||||
public or cluster network. For example::
|
||||
public or cluster network. For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo iptables -A INPUT -i {iface} -m multiport -p tcp -s {ip-address}/{netmask} --dports 6800:7300 -j ACCEPT
|
||||
|
||||
|
@ -4,13 +4,12 @@
|
||||
|
||||
.. index:: pools; configuration
|
||||
|
||||
When you create pools and set the number of placement groups (PGs) for each, Ceph
|
||||
uses default values when you don't specifically override the defaults. **We
|
||||
recommend** overriding some of the defaults. Specifically, we recommend setting
|
||||
a pool's replica size and overriding the default number of placement groups. You
|
||||
can specifically set these values when running `pool`_ commands. You can also
|
||||
override the defaults by adding new ones in the ``[global]`` section of your
|
||||
Ceph configuration file.
|
||||
Ceph uses default values to determine how many placement groups (PGs) will be
|
||||
assigned to each pool. We recommend overriding some of the defaults.
|
||||
Specifically, we recommend setting a pool's replica size and overriding the
|
||||
default number of placement groups. You can set these values when running
|
||||
`pool`_ commands. You can also override the defaults by adding new ones in the
|
||||
``[global]`` section of your Ceph configuration file.
|
||||
|
||||
|
||||
.. literalinclude:: pool-pg.conf
|
||||
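The same defaults can also be kept in the centralized configuration database
instead of ``ceph.conf``; for example (the values shown are illustrative only,
not recommendations):

.. prompt:: bash $

   ceph config set global osd_pool_default_size 3
   ceph config set global osd_pool_default_pg_num 128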
|
@ -4,6 +4,8 @@
|
||||
|
||||
There are several Ceph daemons in a storage cluster:
|
||||
|
||||
.. _rados_configuration_storage-devices_ceph_osd:
|
||||
|
||||
* **Ceph OSDs** (Object Storage Daemons) store most of the data
|
||||
in Ceph. Usually each OSD is backed by a single storage device.
|
||||
This can be a traditional hard disk (HDD) or a solid state disk
|
||||
@ -24,13 +26,14 @@ There are several Ceph daemons in a storage cluster:
|
||||
monitoring and management systems.
|
||||
|
||||
|
||||
OSD Backends
|
||||
============
|
||||
OSD Back Ends
|
||||
=============
|
||||
|
||||
There are two ways that OSDs manage the data they store.
|
||||
As of the Luminous 12.2.z release, the default (and recommended) backend is
|
||||
*BlueStore*. Prior to the Luminous release, the default (and only option) was
|
||||
*Filestore*.
|
||||
There are two ways that OSDs manage the data they store. As of the Luminous
|
||||
12.2.z release, the default (and recommended) back end is *BlueStore*. Prior
|
||||
to the Luminous release, the default (and only) back end was *Filestore*.
|
||||
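To see how many OSDs in a running cluster use each back end, you can count the
``osd_objectstore`` OSD metadata field (a read-only check):

.. prompt:: bash $

   ceph osd count-metadata osd_objectstore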
|
||||
.. _rados_config_storage_devices_bluestore:
|
||||
|
||||
BlueStore
|
||||
---------
|
||||
|
@ -95,7 +95,9 @@ without the ``mon.`` prefix (i.e., ``{mon-id}`` should be the ``a``
|
||||
on ``mon.a``).
|
||||
|
||||
#. Create the default directory on the machine that will host your
|
||||
new monitor. ::
|
||||
new monitor:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {new-mon-host}
|
||||
sudo mkdir /var/lib/ceph/mon/ceph-{mon-id}
|
||||
@ -103,26 +105,34 @@ on ``mon.a``).
|
||||
#. Create a temporary directory ``{tmp}`` to keep the files needed during
|
||||
this process. This directory should be different from the monitor's default
|
||||
directory created in the previous step, and can be removed after all the
|
||||
steps are executed. ::
|
||||
steps are executed:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
mkdir {tmp}
|
||||
|
||||
#. Retrieve the keyring for your monitors, where ``{tmp}`` is the path to
|
||||
the retrieved keyring, and ``{key-filename}`` is the name of the file
|
||||
containing the retrieved monitor key. ::
|
||||
containing the retrieved monitor key:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth get mon. -o {tmp}/{key-filename}
|
||||
|
||||
#. Retrieve the monitor map, where ``{tmp}`` is the path to
|
||||
the retrieved monitor map, and ``{map-filename}`` is the name of the file
|
||||
containing the retrieved monitor map. ::
|
||||
containing the retrieved monitor map:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon getmap -o {tmp}/{map-filename}
|
||||
|
||||
#. Prepare the monitor's data directory created in the first step. You must
|
||||
specify the path to the monitor map so that you can retrieve the
|
||||
information about a quorum of monitors and their ``fsid``. You must also
|
||||
specify a path to the monitor keyring::
|
||||
specify a path to the monitor keyring:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo ceph-mon -i {mon-id} --mkfs --monmap {tmp}/{map-filename} --keyring {tmp}/{key-filename}
|
||||
|
||||
@ -130,7 +140,9 @@ on ``mon.a``).
|
||||
#. Start the new monitor and it will automatically join the cluster.
|
||||
The daemon needs to know which address to bind to, via either the
|
||||
``--public-addr {ip}`` or ``--public-network {network}`` argument.
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-mon -i {mon-id} --public-addr {ip:port}
|
||||
|
||||
@ -154,11 +166,15 @@ procedure results in only two monitor daemons, you may add or remove another
|
||||
monitor until you have a number of ``ceph-mon`` daemons that can achieve a
|
||||
quorum.
|
||||
|
||||
#. Stop the monitor. ::
|
||||
#. Stop the monitor:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
service ceph -a stop mon.{mon-id}
|
||||
|
||||
#. Remove the monitor from the cluster. ::
|
||||
#. Remove the monitor from the cluster:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon remove {mon-id}
|
||||
|
||||
@ -174,37 +190,60 @@ cluster, for example a cluster where the monitors cannot form a
|
||||
quorum.
|
||||
|
||||
|
||||
#. Stop all ``ceph-mon`` daemons on all monitor hosts. ::
|
||||
#. Stop all ``ceph-mon`` daemons on all monitor hosts:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {mon-host}
|
||||
systemctl stop ceph-mon.target
|
||||
# and repeat for all mons
|
||||
|
||||
#. Identify a surviving monitor and log in to that host. ::
|
||||
Repeat for all monitor hosts.
|
||||
|
||||
#. Identify a surviving monitor and log in to that host:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {mon-host}
|
||||
|
||||
#. Extract a copy of the monmap file. ::
|
||||
#. Extract a copy of the monmap file:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-mon -i {mon-id} --extract-monmap {map-path}
|
||||
# in most cases, that's
|
||||
|
||||
In most cases, this command will be:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-mon -i `hostname` --extract-monmap /tmp/monmap
|
||||
|
||||
#. Remove the non-surviving or problematic monitors. For example, if
|
||||
you have three monitors, ``mon.a``, ``mon.b``, and ``mon.c``, where
|
||||
only ``mon.a`` will survive, follow the example below::
|
||||
only ``mon.a`` will survive, follow the example below:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
monmaptool {map-path} --rm {mon-id}
|
||||
# for example,
|
||||
|
||||
For example,
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
monmaptool /tmp/monmap --rm b
|
||||
monmaptool /tmp/monmap --rm c
|
||||
|
||||
#. Inject the surviving map with the removed monitors into the
|
||||
surviving monitor(s). For example, to inject a map into monitor
|
||||
``mon.a``, follow the example below::
|
||||
``mon.a``, follow the example below:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-mon -i {mon-id} --inject-monmap {map-path}
|
||||
# for example,
|
||||
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-mon -i a --inject-monmap /tmp/monmap
|
||||
|
||||
#. Start only the surviving monitors.
|
||||
@ -316,13 +355,19 @@ networks are unable to communicate. Use the following procedure:
|
||||
|
||||
#. Retrieve the monitor map, where ``{tmp}`` is the path to
|
||||
the retrieved monitor map, and ``{filename}`` is the name of the file
|
||||
containing the retrieved monitor map. ::
|
||||
containing the retrieved monitor map:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon getmap -o {tmp}/{filename}
|
||||
|
||||
#. The following example demonstrates the contents of the monmap. ::
|
||||
#. The following example demonstrates the contents of the monmap:
|
||||
|
||||
$ monmaptool --print {tmp}/{filename}
|
||||
.. prompt:: bash $
|
||||
|
||||
monmaptool --print {tmp}/{filename}
|
||||
|
||||
::
|
||||
|
||||
monmaptool: monmap file {tmp}/{filename}
|
||||
epoch 1
|
||||
@ -333,9 +378,14 @@ networks are unable to communicate. Use the following procedure:
|
||||
1: 10.0.0.2:6789/0 mon.b
|
||||
2: 10.0.0.3:6789/0 mon.c
|
||||
|
||||
#. Remove the existing monitors. ::
|
||||
#. Remove the existing monitors:
|
||||
|
||||
$ monmaptool --rm a --rm b --rm c {tmp}/{filename}
|
||||
.. prompt:: bash $
|
||||
|
||||
monmaptool --rm a --rm b --rm c {tmp}/{filename}
|
||||
|
||||
|
||||
::
|
||||
|
||||
monmaptool: monmap file {tmp}/{filename}
|
||||
monmaptool: removing a
|
||||
@ -343,16 +393,25 @@ networks are unable to communicate. Use the following procedure:
|
||||
monmaptool: removing c
|
||||
monmaptool: writing epoch 1 to {tmp}/{filename} (0 monitors)
|
||||
|
||||
#. Add the new monitor locations. ::
|
||||
#. Add the new monitor locations:
|
||||
|
||||
$ monmaptool --add a 10.1.0.1:6789 --add b 10.1.0.2:6789 --add c 10.1.0.3:6789 {tmp}/{filename}
|
||||
.. prompt:: bash $
|
||||
|
||||
monmaptool --add a 10.1.0.1:6789 --add b 10.1.0.2:6789 --add c 10.1.0.3:6789 {tmp}/{filename}
|
||||
|
||||
|
||||
::
|
||||
|
||||
monmaptool: monmap file {tmp}/{filename}
|
||||
monmaptool: writing epoch 1 to {tmp}/{filename} (3 monitors)
|
||||
|
||||
#. Check new contents. ::
|
||||
#. Check new contents:
|
||||
|
||||
$ monmaptool --print {tmp}/{filename}
|
||||
.. prompt:: bash $
|
||||
|
||||
monmaptool --print {tmp}/{filename}
|
||||
|
||||
::
|
||||
|
||||
monmaptool: monmap file {tmp}/{filename}
|
||||
epoch 1
|
||||
@ -370,7 +429,9 @@ monitors, and inject the modified monmap into each new monitor.
|
||||
#. First, make sure to stop all your monitors. Injection must be done while
|
||||
the daemon is not running.
|
||||
|
||||
#. Inject the monmap. ::
|
||||
#. Inject the monmap:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-mon -i {mon-id} --inject-monmap {tmp}/{filename}
|
||||
|
||||
|
@ -71,7 +71,9 @@ weight).
|
||||
|
||||
#. Create the OSD. If no UUID is given, it will be set automatically when the
|
||||
OSD starts up. The following command will output the OSD number, which you
|
||||
will need for subsequent steps. ::
|
||||
will need for subsequent steps:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd create [{uuid} [{id}]]
|
||||
|
||||
@ -84,21 +86,25 @@ weight).
|
||||
clusters are large. If {id} is not specified, the smallest available is
|
||||
used.
|
||||
|
||||
#. Create the default directory on your new OSD. ::
|
||||
#. Create the default directory on your new OSD:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {new-osd-host}
|
||||
sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}
|
||||
|
||||
|
||||
#. If the OSD is for a drive other than the OS drive, prepare it
|
||||
for use with Ceph, and mount it to the directory you just created::
|
||||
for use with Ceph, and mount it to the directory you just created:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {new-osd-host}
|
||||
sudo mkfs -t {fstype} /dev/{drive}
|
||||
sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number}
|
||||
|
||||
#. Initialize the OSD data directory:
|
||||
|
||||
#. Initialize the OSD data directory. ::
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {new-osd-host}
|
||||
ceph-osd -i {osd-num} --mkfs --mkkey
|
||||
@ -107,11 +113,12 @@ weight).
|
||||
|
||||
#. Register the OSD authentication key. The value of ``ceph`` for
|
||||
``ceph-{osd-num}`` in the path is the ``$cluster-$id``. If your
|
||||
cluster name differs from ``ceph``, use your cluster name instead.::
|
||||
cluster name differs from ``ceph``, use your cluster name instead:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth add osd.{osd-num} osd 'allow *' mon 'allow rwx' -i /var/lib/ceph/osd/ceph-{osd-num}/keyring
|
||||
|
||||
|
||||
#. Add the OSD to the CRUSH map so that the OSD can begin receiving data. The
|
||||
``ceph osd crush add`` command allows you to add OSDs to the CRUSH hierarchy
|
||||
wherever you wish. If you specify at least one bucket, the command
|
||||
@ -120,7 +127,9 @@ weight).
|
||||
you specify only the root bucket, the command will attach the OSD directly
|
||||
to the root, but CRUSH rules expect OSDs to be inside of hosts.
|
||||
|
||||
Execute the following::
|
||||
Execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush add {id-or-name} {weight} [{bucket-type}={bucket-name} ...]
|
||||
|
||||
@ -135,34 +144,49 @@ weight).
|
||||
Replacing an OSD
|
||||
----------------
|
||||
|
||||
.. note:: If the instructions in this section do not work for you, try the
|
||||
instructions in the cephadm documentation: :ref:`cephadm-replacing-an-osd`.
|
||||
|
||||
When a disk fails, or if an administrator wants to reprovision OSDs with a new
backend (for instance, when switching from FileStore to BlueStore), OSDs need
to be replaced. Unlike `Removing the OSD`_, the replaced OSD's ID and CRUSH map
entry must be kept intact after the OSD is destroyed for replacement.
|
||||
#. Make sure it is safe to destroy the OSD::
|
||||
#. Make sure it is safe to destroy the OSD:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
while ! ceph osd safe-to-destroy osd.{id} ; do sleep 10 ; done
|
||||
|
||||
#. Destroy the OSD first::
|
||||
#. Destroy the OSD first:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd destroy {id} --yes-i-really-mean-it
|
||||
|
||||
#. Zap a disk for the new OSD, if the disk was used before for other purposes.
|
||||
It's not necessary for a new disk::
|
||||
It's not necessary for a new disk:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm zap /dev/sdX
|
||||
|
||||
#. Prepare the disk for replacement by using the previously destroyed OSD id::
|
||||
#. Prepare the disk for replacement by using the previously destroyed OSD id:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm prepare --osd-id {id} --data /dev/sdX
|
||||
|
||||
#. And activate the OSD::
|
||||
#. And activate the OSD:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm activate {id} {fsid}
|
||||
|
||||
Alternatively, instead of preparing and activating, the device can be recreated
|
||||
in one call, like::
|
||||
in one call, like:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm create --osd-id {id} --data /dev/sdX
|
||||
|
||||
@ -174,7 +198,9 @@ After you add an OSD to Ceph, the OSD is in your configuration. However,
|
||||
it is not yet running. The OSD is ``down`` and ``in``. You must start
|
||||
your new OSD before it can begin receiving data. You may use
|
||||
``service ceph`` from your admin host or start the OSD from its host
|
||||
machine::
|
||||
machine:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
sudo systemctl start ceph-osd@{osd-num}
|
||||
|
||||
@ -187,7 +213,9 @@ Observe the Data Migration
|
||||
|
||||
Once you have added your new OSD to the CRUSH map, Ceph will begin rebalancing
|
||||
the server by migrating placement groups to your new OSD. You can observe this
|
||||
process with the `ceph`_ tool. ::
|
||||
process with the `ceph`_ tool:
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph -w
|
||||
|
||||
@ -195,7 +223,6 @@ You should see the placement group states change from ``active+clean`` to
|
||||
``active, some degraded objects``, and finally ``active+clean`` when migration
|
||||
completes. (Control-c to exit.)
|
||||
|
||||
|
||||
.. _Add/Move an OSD: ../crush-map#addosd
|
||||
.. _ceph: ../monitoring
|
||||
|
||||
@ -222,7 +249,9 @@ Take the OSD out of the Cluster
|
||||
|
||||
Before you remove an OSD, it is usually ``up`` and ``in``. You need to take it
|
||||
out of the cluster so that Ceph can begin rebalancing and copying its data to
|
||||
other OSDs. ::
|
||||
other OSDs:
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd out {osd-num}
|
||||
|
||||
@ -232,7 +261,9 @@ Observe the Data Migration
|
||||
|
||||
Once you have taken your OSD ``out`` of the cluster, Ceph will begin
|
||||
rebalancing the cluster by migrating placement groups out of the OSD you
|
||||
removed. You can observe this process with the `ceph`_ tool. ::
|
||||
removed. You can observe this process with the `ceph`_ tool:
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph -w
|
||||
|
||||
@ -246,12 +277,16 @@ completes. (Control-c to exit.)
|
||||
``active+remapped`` state. If you are in this case, you should mark
|
||||
the OSD ``in`` with:
|
||||
|
||||
``ceph osd in {osd-num}``
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd in {osd-num}
|
||||
|
||||
to come back to the initial state and then, instead of marking ``out``
|
||||
the OSD, set its weight to 0 with:
|
||||
|
||||
``ceph osd crush reweight osd.{osd-num} 0``
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush reweight osd.{osd-num} 0
|
||||
|
||||
After that, you can observe the data migration which should come to its
|
||||
end. The difference between marking ``out`` the OSD and reweighting it
|
||||
@ -267,7 +302,9 @@ Stopping the OSD
|
||||
|
||||
After you take an OSD out of the cluster, it may still be running.
|
||||
That is, the OSD may be ``up`` and ``out``. You must stop
|
||||
your OSD before you remove it from the configuration. ::
|
||||
your OSD before you remove it from the configuration:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {osd-host}
|
||||
sudo systemctl stop ceph-osd@{osd-num}
|
||||
@ -286,50 +323,64 @@ OSD for each drive by repeating this procedure.
|
||||
#. Let the cluster forget the OSD first. This step removes the OSD from the CRUSH
|
||||
map, removes its authentication key. And it is removed from the OSD map as
|
||||
well. Please note the :ref:`purge subcommand <ceph-admin-osd>` is introduced in Luminous, for older
|
||||
versions, please see below ::
|
||||
versions, please see below:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd purge {id} --yes-i-really-mean-it
|
||||
|
||||
#. Navigate to the host where you keep the master copy of the cluster's
|
||||
``ceph.conf`` file. ::
|
||||
``ceph.conf`` file:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh {admin-host}
|
||||
cd /etc/ceph
|
||||
vim ceph.conf
|
||||
|
||||
#. Remove the OSD entry from your ``ceph.conf`` file (if it exists). ::
|
||||
#. Remove the OSD entry from your ``ceph.conf`` file (if it exists)::
|
||||
|
||||
[osd.1]
|
||||
host = {hostname}
|
||||
|
||||
#. From the host where you keep the master copy of the cluster's ``ceph.conf`` file,
|
||||
copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of other
|
||||
hosts in your cluster.
|
||||
#. From the host where you keep the master copy of the cluster's ``ceph.conf``
|
||||
file, copy the updated ``ceph.conf`` file to the ``/etc/ceph`` directory of
|
||||
other hosts in your cluster.
|
||||
|
||||
If your Ceph cluster is older than Luminous, instead of using ``ceph osd purge``,
|
||||
you need to perform this step manually:
|
||||
If your Ceph cluster is older than Luminous, instead of using ``ceph osd
|
||||
purge``, you need to perform this step manually:
|
||||
|
||||
|
||||
#. Remove the OSD from the CRUSH map so that it no longer receives data. You may
|
||||
also decompile the CRUSH map, remove the OSD from the device list, remove the
|
||||
device as an item in the host bucket or remove the host bucket (if it's in the
|
||||
CRUSH map and you intend to remove the host), recompile the map and set it.
|
||||
See `Remove an OSD`_ for details. ::
|
||||
See `Remove an OSD`_ for details:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush remove {name}
|
||||
|
||||
#. Remove the OSD authentication key. ::
|
||||
#. Remove the OSD authentication key:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth del osd.{osd-num}
|
||||
|
||||
The value of ``ceph`` for ``ceph-{osd-num}`` in the path is the ``$cluster-$id``.
|
||||
If your cluster name differs from ``ceph``, use your cluster name instead.
|
||||
The value of ``ceph`` for ``ceph-{osd-num}`` in the path is the
|
||||
``$cluster-$id``. If your cluster name differs from ``ceph``, use your
|
||||
cluster name instead.
|
||||
|
||||
#. Remove the OSD. ::
|
||||
#. Remove the OSD:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd rm {osd-num}
|
||||
#for example
|
||||
|
||||
for example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd rm 1
|
||||
|
||||
|
||||
.. _Remove an OSD: ../crush-map#removeosd
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
.. _balancer:
|
||||
|
||||
Balancer
|
||||
@ -11,7 +10,9 @@ supervised fashion.
|
||||
Status
|
||||
------
|
||||
|
||||
The current status of the balancer can be checked at any time with::
|
||||
The current status of the balancer can be checked at any time with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer status
|
||||
|
||||
@ -21,7 +22,9 @@ Automatic balancing
|
||||
|
||||
The automatic balancing feature is enabled by default in ``upmap``
|
||||
mode. Please refer to :ref:`upmap` for more details. The balancer can be
|
||||
turned off with::
|
||||
turned off with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer off
|
||||
|
||||
@ -40,35 +43,49 @@ healed itself).
|
||||
When the cluster is healthy, the balancer will throttle its changes
|
||||
such that the percentage of PGs that are misplaced (i.e., that need to
|
||||
be moved) is below a threshold of (by default) 5%. The
|
||||
``target_max_misplaced_ratio`` threshold can be adjusted with::
|
||||
``target_max_misplaced_ratio`` threshold can be adjusted with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr target_max_misplaced_ratio .07 # 7%
|
||||
|
||||
Set the number of seconds to sleep in between runs of the automatic balancer::
|
||||
Set the number of seconds to sleep in between runs of the automatic balancer:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/sleep_interval 60
|
||||
|
||||
Set the time of day to begin automatic balancing in HHMM format::
|
||||
Set the time of day to begin automatic balancing in HHMM format:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/begin_time 0000
|
||||
|
||||
Set the time of day to finish automatic balancing in HHMM format::
|
||||
Set the time of day to finish automatic balancing in HHMM format:
|
||||
|
||||
ceph config set mgr mgr/balancer/end_time 2400
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/end_time 2359
|
||||
|
||||
Restrict automatic balancing to this day of the week or later.
|
||||
Uses the same conventions as crontab, 0 or 7 is Sunday, 1 is Monday, and so on::
|
||||
Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/begin_weekday 0
|
||||
|
||||
Restrict automatic balancing to this day of the week or earlier.
|
||||
Uses the same conventions as crontab, 0 or 7 is Sunday, 1 is Monday, and so on::
|
||||
Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on:
|
||||
|
||||
ceph config set mgr mgr/balancer/end_weekday 7
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/end_weekday 6
|
||||
|
||||
Pool IDs to which the automatic balancing will be limited.
|
||||
The default for this is an empty string, meaning all pools will be balanced.
|
||||
The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` command::
|
||||
The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph config set mgr mgr/balancer/pool_ids 1,2,3
|
||||
|
||||
@ -112,7 +129,9 @@ There are currently two supported balancer modes:
|
||||
|
||||
Note that using upmap requires that all clients be Luminous or newer.
|
||||
|
||||
The default mode is ``upmap``. The mode can be adjusted with::
|
||||
The default mode is ``upmap``. The mode can be adjusted with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer mode crush-compat
|
||||
|
||||
@ -125,43 +144,63 @@ The balancer operation is broken into a few distinct phases:
|
||||
#. evaluating the quality of the data distribution, either for the current PG distribution, or the PG distribution that would result after executing a *plan*
|
||||
#. executing the *plan*
|
||||
|
||||
To evaluate and score the current distribution::
|
||||
To evaluate and score the current distribution:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer eval
|
||||
|
||||
You can also evaluate the distribution for a single pool with::
|
||||
You can also evaluate the distribution for a single pool with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer eval <pool-name>
|
||||
|
||||
Greater detail for the evaluation can be seen with::
|
||||
Greater detail for the evaluation can be seen with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer eval-verbose ...
|
||||
|
||||
The balancer can generate a plan, using the currently configured mode, with::
|
||||
The balancer can generate a plan, using the currently configured mode, with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer optimize <plan-name>
|
||||
|
||||
The name is provided by the user and can be any useful identifying string. The contents of a plan can be seen with::
|
||||
The name is provided by the user and can be any useful identifying string. The contents of a plan can be seen with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer show <plan-name>
|
||||
|
||||
All plans can be shown with::
|
||||
All plans can be shown with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer ls
|
||||
|
||||
Old plans can be discarded with::
|
||||
Old plans can be discarded with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer rm <plan-name>
|
||||
|
||||
Currently recorded plans are shown as part of the status command::
|
||||
Currently recorded plans are shown as part of the status command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer status
|
||||
|
||||
The quality of the distribution that would result after executing a plan can be calculated with::
|
||||
The quality of the distribution that would result after executing a plan can be calculated with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer eval <plan-name>
|
||||
|
||||
Assuming the plan is expected to improve the distribution (i.e., it has a lower score than the current cluster state), the user can execute that plan with::
|
||||
Assuming the plan is expected to improve the distribution (i.e., it has a lower score than the current cluster state), the user can execute that plan with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph balancer execute <plan-name>
|
||||
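Putting these commands together, a typical supervised balancing pass looks
something like the following sketch (the plan name ``myplan`` is arbitrary):

.. prompt:: bash $

   ceph balancer eval                # score the current distribution
   ceph balancer optimize myplan     # build a plan using the active mode
   ceph balancer show myplan         # inspect the proposed changes
   ceph balancer eval myplan         # score the distribution the plan would produce
   ceph balancer execute myplan      # apply the plan if the score improved
   ceph balancer rm myplan           # discard the plan afterwards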
|
||||
|
@ -2,28 +2,29 @@
|
||||
BlueStore Migration
|
||||
=====================
|
||||
|
||||
Each OSD can run either BlueStore or FileStore, and a single Ceph
|
||||
Each OSD can run either BlueStore or Filestore, and a single Ceph
|
||||
cluster can contain a mix of both. Users who have previously deployed
|
||||
FileStore are likely to want to transition to BlueStore in order to
|
||||
take advantage of the improved performance and robustness. There are
|
||||
Filestore OSDs should transition to BlueStore in order to
|
||||
take advantage of the improved performance and robustness. Moreover,
|
||||
Ceph releases beginning with Reef do not support Filestore. There are
|
||||
several strategies for making such a transition.
|
||||
|
||||
An individual OSD cannot be converted in place in isolation, however:
|
||||
BlueStore and FileStore are simply too different for that to be
|
||||
practical. "Conversion" will rely either on the cluster's normal
|
||||
An individual OSD cannot be converted in place;
|
||||
BlueStore and Filestore are simply too different for that to be
|
||||
feasible. The conversion process uses either the cluster's normal
|
||||
replication and healing support or tools and strategies that copy OSD
|
||||
content from an old (FileStore) device to a new (BlueStore) one.
|
||||
content from an old (Filestore) device to a new (BlueStore) one.
|
||||
|
||||
|
||||
Deploy new OSDs with BlueStore
|
||||
==============================
|
||||
|
||||
Any new OSDs (e.g., when the cluster is expanded) can be deployed
|
||||
New OSDs (e.g., when the cluster is expanded) should be deployed
|
||||
using BlueStore. This is the default behavior so no specific change
|
||||
is needed.
|
||||
|
||||
Similarly, any OSDs that are reprovisioned after replacing a failed drive
|
||||
can use BlueStore.
|
||||
should use BlueStore.
|
||||
|
||||
Convert existing OSDs
|
||||
=====================
|
||||
@ -31,69 +32,96 @@ Convert existing OSDs
|
||||
Mark out and replace
|
||||
--------------------
|
||||
|
||||
The simplest approach is to mark out each device in turn, wait for the
|
||||
The simplest approach is to ensure that the cluster is healthy,
|
||||
then mark ``out`` each device in turn, wait for
|
||||
data to replicate across the cluster, reprovision the OSD, and mark
|
||||
it back in again. It is simple and easy to automate. However, it requires
|
||||
more data migration than should be necessary, so it is not optimal.
|
||||
it back ``in`` again. Proceed to the next OSD when recovery is complete.
|
||||
This is easy to automate but results in more data migration than
|
||||
is strictly necessary, which in turn presents additional wear to SSDs and takes
|
||||
longer to complete.
|
||||
|
||||
#. Identify a FileStore OSD to replace::
|
||||
#. Identify a Filestore OSD to replace::
|
||||
|
||||
ID=<osd-id-number>
|
||||
DEVICE=<disk-device>
|
||||
|
||||
You can tell whether a given OSD is FileStore or BlueStore with::
|
||||
You can tell whether a given OSD is Filestore or BlueStore with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd metadata $ID | grep osd_objectstore
|
||||
|
||||
You can get a current count of filestore vs bluestore with::
|
||||
You can get a current count of Filestore and BlueStore OSDs with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd count-metadata osd_objectstore
|
||||
|
||||
#. Mark the filestore OSD out::
|
||||
#. Mark the Filestore OSD ``out``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd out $ID
|
||||
|
||||
#. Wait for the data to migrate off the OSD in question::
|
||||
#. Wait for the data to migrate off the OSD in question:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
while ! ceph osd safe-to-destroy $ID ; do sleep 60 ; done
|
||||
|
||||
#. Stop the OSD::
|
||||
#. Stop the OSD:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
systemctl kill ceph-osd@$ID
|
||||
|
||||
#. Make note of which device this OSD is using::
|
||||
#. Note which device this OSD is using:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
mount | grep /var/lib/ceph/osd/ceph-$ID
|
||||
|
||||
#. Unmount the OSD::
|
||||
#. Unmount the OSD:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
umount /var/lib/ceph/osd/ceph-$ID
|
||||
|
||||
#. Destroy the OSD data. Be *EXTREMELY CAREFUL* as this will destroy
|
||||
the contents of the device; be certain the data on the device is
|
||||
not needed (i.e., that the cluster is healthy) before proceeding. ::
|
||||
not needed (i.e., that the cluster is healthy) before proceeding:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm zap $DEVICE
|
||||
|
||||
#. Tell the cluster the OSD has been destroyed (and a new OSD can be
|
||||
reprovisioned with the same ID)::
|
||||
reprovisioned with the same ID):
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd destroy $ID --yes-i-really-mean-it
|
||||
|
||||
#. Reprovision a BlueStore OSD in its place with the same OSD ID.
|
||||
#. Provision a BlueStore OSD in its place with the same OSD ID.
|
||||
This requires you to identify which device to wipe based on what you saw
|
||||
mounted above. BE CAREFUL! ::
|
||||
mounted above. BE CAREFUL! Also note that hybrid OSDs may require
|
||||
adjustments to these commands:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm create --bluestore --data $DEVICE --osd-id $ID
|
||||
|
||||
#. Repeat.
|
||||
|
||||
You can allow the refilling of the replacement OSD to happen
|
||||
You can allow balancing of the replacement OSD to happen
|
||||
concurrently with the draining of the next OSD, or follow the same
|
||||
procedure for multiple OSDs in parallel, as long as you ensure the
|
||||
cluster is fully clean (all data has all replicas) before destroying
|
||||
any OSDs. Failure to do so will reduce the redundancy of your data
|
||||
and increase the risk of (or potentially even cause) data loss.
|
||||
any OSDs. If you reprovision multiple OSDs in parallel, be **very** careful to
|
||||
only zap / destroy OSDs within a single CRUSH failure domain, e.g. ``host`` or
|
||||
``rack``. Failure to do so will reduce the redundancy and availability of
|
||||
your data and increase the risk of (or even cause) data loss.
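
The per-OSD steps above lend themselves to scripting. The following is a
minimal sketch of the drain-and-wait portion only, assuming hypothetical
Filestore OSD IDs ``3``, ``7`` and ``11`` that all sit in the same CRUSH
failure domain; the unmount, zap, destroy and re-creation steps described
above still have to follow for each OSD:

.. prompt:: bash $

for ID in 3 7 11 ; do
    ceph osd out $ID
    while ! ceph osd safe-to-destroy $ID ; do sleep 60 ; done
    # run on the host that carries these OSDs
    systemctl kill ceph-osd@$ID
    # continue with umount, zap, destroy and re-creation as shown above
done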
|
||||
|
||||
|
||||
Advantages:
|
||||
|
||||
@ -116,30 +144,34 @@ to evacuate an entire host in order to use it as a spare, then the
|
||||
conversion can be done on a host-by-host basis with each stored copy of
|
||||
the data migrating only once.
|
||||
|
||||
First, you need have empty host that has no data. There are two ways to do this: either by starting with a new, empty host that isn't yet part of the cluster, or by offloading data from an existing host that in the cluster.
|
||||
First, you need an empty host that has no OSDs provisioned. There are two
|
||||
ways to do this: either by starting with a new, empty host that isn't yet
|
||||
part of the cluster, or by offloading data from an existing host in the cluster.
|
||||
|
||||
Use a new, empty host
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Ideally the host should have roughly the
|
||||
same capacity as other hosts you will be converting (although it
|
||||
doesn't strictly matter). ::
|
||||
same capacity as other hosts you will be converting.
|
||||
Add the host to the CRUSH hierarchy, but do not attach it to the root:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
NEWHOST=<empty-host-name>
|
||||
|
||||
Add the host to the CRUSH hierarchy, but do not attach it to the root::
|
||||
|
||||
ceph osd crush add-bucket $NEWHOST host
|
||||
|
||||
Make sure the ceph packages are installed.
|
||||
Make sure that Ceph packages are installed on the new host.
|
||||
|
||||
Use an existing host
|
||||
^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
If you would like to use an existing host
|
||||
that is already part of the cluster, and there is sufficient free
|
||||
space on that host so that all of its data can be migrated off,
|
||||
then you can instead do::
|
||||
space on that host so that all of its data can be migrated off to
|
||||
other cluster hosts, you can instead do::
|
||||
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
OLDHOST=<existing-cluster-host-to-offload>
|
||||
ceph osd crush unlink $OLDHOST default
|
||||
@ -147,9 +179,14 @@ then you can instead do::
|
||||
where "default" is the immediate ancestor in the CRUSH map. (For
|
||||
smaller clusters with unmodified configurations this will normally
|
||||
be "default", but it might also be a rack name.) You should now
|
||||
see the host at the top of the OSD tree output with no parent::
|
||||
see the host at the top of the OSD tree output with no parent:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tree
|
||||
|
||||
::
|
||||
|
||||
$ bin/ceph osd tree
|
||||
ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
|
||||
-5 0 host oldhost
|
||||
10 ssd 1.00000 osd.10 up 1.00000 1.00000
|
||||
@ -172,11 +209,15 @@ Migration process
|
||||
If you're using a new host, start at step #1. For an existing host,
|
||||
jump to step #5 below.
|
||||
|
||||
#. Provision new BlueStore OSDs for all devices::
|
||||
#. Provision new BlueStore OSDs for all devices:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm create --bluestore --data /dev/$DEVICE
|
||||
|
||||
#. Verify OSDs join the cluster with::
|
||||
#. Verify OSDs join the cluster with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tree
|
||||
|
||||
@ -198,11 +239,15 @@ jump to step #5 below.
|
||||
2 ssd 1.00000 osd.2 up 1.00000 1.00000
|
||||
...
|
||||
|
||||
#. Identify the first target host to convert ::
|
||||
#. Identify the first target host to convert :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
OLDHOST=<existing-cluster-host-to-convert>
|
||||
|
||||
#. Swap the new host into the old host's position in the cluster::
|
||||
#. Swap the new host into the old host's position in the cluster:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush swap-bucket $NEWHOST $OLDHOST
|
||||
|
||||
@ -212,29 +257,39 @@ jump to step #5 below.
|
||||
other nodes in the cluster, but as long as the hosts are similarly
|
||||
sized this will be a relatively small amount of data.
|
||||
|
||||
#. Wait for data migration to complete::
|
||||
#. Wait for data migration to complete:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
while ! ceph osd safe-to-destroy $(ceph osd ls-tree $OLDHOST); do sleep 60 ; done
|
||||
|
||||
#. Stop all old OSDs on the now-empty ``$OLDHOST``::
|
||||
#. Stop all old OSDs on the now-empty ``$OLDHOST``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ssh $OLDHOST
|
||||
systemctl kill ceph-osd.target
|
||||
umount /var/lib/ceph/osd/ceph-*
|
||||
|
||||
#. Destroy and purge the old OSDs::
|
||||
#. Destroy and purge the old OSDs:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
for osd in `ceph osd ls-tree $OLDHOST`; do
|
||||
ceph osd purge $osd --yes-i-really-mean-it
|
||||
done
|
||||
|
||||
#. Wipe the old OSD devices. This requires you to identify which
|
||||
devices are to be wiped manually (BE CAREFUL!). For each device,::
|
||||
devices are to be wiped manually (BE CAREFUL!). For each device:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph-volume lvm zap $DEVICE
|
||||
|
||||
#. Use the now-empty host as the new host, and repeat::
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
NEWHOST=$OLDHOST
|
||||
|
||||
Advantages:
|
||||
@ -248,7 +303,7 @@ Disadvantages:
|
||||
|
||||
* A spare host is required.
|
||||
* An entire host's worth of OSDs will be migrating data at a time. This
|
||||
is like likely to impact overall cluster performance.
|
||||
is likely to impact overall cluster performance.
|
||||
* All migrated data still makes one full hop over the network.
|
||||
|
||||
|
||||
@ -258,13 +313,13 @@ Per-OSD device copy
|
||||
A single logical OSD can be converted by using the ``copy`` function
|
||||
of ``ceph-objectstore-tool``. This requires that the host have a free
|
||||
device (or devices) to provision a new, empty BlueStore OSD. For
|
||||
example, if each host in your cluster has 12 OSDs, then you'd need a
|
||||
13th available device so that each OSD can be converted in turn before the
|
||||
example, if each host in your cluster has twelve OSDs, then you'd need a
|
||||
thirteenth unused device so that each OSD can be converted in turn before the
|
||||
old device is reclaimed to convert the next OSD.
|
||||
|
||||
Caveats:
|
||||
|
||||
* This strategy requires that a blank BlueStore OSD be prepared
|
||||
* This strategy requires that an empty BlueStore OSD be prepared
|
||||
without allocating a new OSD ID, something that the ``ceph-volume``
|
||||
tool doesn't support. More importantly, the setup of *dmcrypt* is
|
||||
closely tied to the OSD identity, which means that this approach
|
||||
|
@ -45,16 +45,22 @@ and the backing storage tier automatically. However, admins have the ability to
|
||||
configure how this migration takes place by setting the ``cache-mode``. There are
|
||||
two main scenarios:
|
||||
|
||||
- **writeback** mode: When admins configure tiers with ``writeback`` mode, Ceph
|
||||
clients write data to the cache tier and receive an ACK from the cache tier.
|
||||
In time, the data written to the cache tier migrates to the storage tier
|
||||
and gets flushed from the cache tier. Conceptually, the cache tier is
|
||||
overlaid "in front" of the backing storage tier. When a Ceph client needs
|
||||
data that resides in the storage tier, the cache tiering agent migrates the
|
||||
data to the cache tier on read, then it is sent to the Ceph client.
|
||||
Thereafter, the Ceph client can perform I/O using the cache tier, until the
|
||||
data becomes inactive. This is ideal for mutable data (e.g., photo/video
|
||||
editing, transactional data, etc.).
|
||||
- **writeback** mode: If the base tier and the cache tier are configured in
|
||||
``writeback`` mode, Ceph clients receive an ACK from the base tier every time
|
||||
they write data to it. Then the cache tiering agent determines whether
|
||||
``osd_tier_default_cache_min_write_recency_for_promote`` has been set. If it
|
||||
has been set and the data has been written more than a specified number of
|
||||
times per interval, the data is promoted to the cache tier.
|
||||
|
||||
When Ceph clients need access to data stored in the base tier, the cache
|
||||
tiering agent reads the data from the base tier and returns it to the client.
|
||||
While data is being read from the base tier, the cache tiering agent consults
|
||||
the value of ``osd_tier_default_cache_min_read_recency_for_promote`` and
|
||||
decides whether to promote that data from the base tier to the cache tier.
|
||||
When data has been promoted from the base tier to the cache tier, the Ceph
|
||||
client is able to perform I/O operations on it using the cache tier. This is
|
||||
well-suited for mutable data (for example, photo/video editing, transactional
|
||||
data).
|
||||
|
||||
- **readproxy** mode: This mode will use any objects that already
|
||||
exist in the cache tier, but if an object is not present in the
|
||||
@ -199,30 +205,42 @@ Creating a Cache Tier
|
||||
=====================
|
||||
|
||||
Setting up a cache tier involves associating a backing storage pool with
|
||||
a cache pool ::
|
||||
a cache pool:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier add {storagepool} {cachepool}
|
||||
|
||||
For example ::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier add cold-storage hot-storage
|
||||
|
||||
To set the cache mode, execute the following::
|
||||
To set the cache mode, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier cache-mode {cachepool} {cache-mode}
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier cache-mode hot-storage writeback
|
||||
|
||||
The cache tiers overlay the backing storage tier, so they require one
|
||||
additional step: you must direct all client traffic from the storage pool to
|
||||
the cache pool. To direct client traffic directly to the cache pool, execute
|
||||
the following::
|
||||
the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier set-overlay {storagepool} {cachepool}
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier set-overlay cold-storage hot-storage
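
Putting these steps together, a minimal sketch that creates an example cache
pool and attaches it to an existing ``cold-storage`` pool (the pool name and
PG count here are illustrative, not recommendations):

.. prompt:: bash $

ceph osd pool create hot-storage 32 32
ceph osd tier add cold-storage hot-storage
ceph osd tier cache-mode hot-storage writeback
ceph osd tier set-overlay cold-storage hot-storage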
|
||||
|
||||
@ -231,7 +249,9 @@ Configuring a Cache Tier
|
||||
========================
|
||||
|
||||
Cache tiers have several configuration options. You may set
|
||||
cache tier configuration options with the following usage::
|
||||
cache tier configuration options with the following usage:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} {key} {value}
|
||||
|
||||
@ -241,16 +261,22 @@ See `Pools - Set Pool Values`_ for details.
|
||||
Target Size and Type
|
||||
--------------------
|
||||
|
||||
Ceph's production cache tiers use a `Bloom Filter`_ for the ``hit_set_type``::
|
||||
Ceph's production cache tiers use a `Bloom Filter`_ for the ``hit_set_type``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} hit_set_type bloom
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set hot-storage hit_set_type bloom
|
||||
|
||||
The ``hit_set_count`` and ``hit_set_period`` define how many such HitSets to
|
||||
store, and how much time each HitSet should cover. ::
|
||||
store, and how much time each HitSet should cover:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} hit_set_count 12
|
||||
ceph osd pool set {cachepool} hit_set_period 14400
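
For example, to keep twelve HitSets of four hours (14400 seconds) each on the
``hot-storage`` pool used elsewhere in this section:

.. prompt:: bash $

ceph osd pool set hot-storage hit_set_count 12
ceph osd pool set hot-storage hit_set_period 14400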
|
||||
@ -273,7 +299,9 @@ number of archive HitSets are checked. The object is promoted if the object is
|
||||
found in any of the most recent ``min_read_recency_for_promote`` HitSets.
|
||||
|
||||
A similar parameter can be set for the write operation, which is
|
||||
``min_write_recency_for_promote``. ::
|
||||
``min_write_recency_for_promote``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} min_read_recency_for_promote 2
|
||||
ceph osd pool set {cachepool} min_write_recency_for_promote 2
|
||||
@ -303,20 +331,27 @@ Absolute Sizing
|
||||
|
||||
The cache tiering agent can flush or evict objects based upon the total number
|
||||
of bytes or the total number of objects. To specify a maximum number of bytes,
|
||||
execute the following::
|
||||
execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} target_max_bytes {#bytes}
|
||||
|
||||
For example, to flush or evict at 1 TB, execute the following::
|
||||
For example, to flush or evict at 1 TB, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set hot-storage target_max_bytes 1099511627776
|
||||
|
||||
To specify the maximum number of objects, execute the following:
|
||||
|
||||
To specify the maximum number of objects, execute the following::
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} target_max_objects {#objects}
|
||||
|
||||
For example, to flush or evict at 1M objects, execute the following::
|
||||
For example, to flush or evict at 1M objects, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set hot-storage target_max_objects 1000000
|
||||
|
||||
@ -335,34 +370,46 @@ The cache tiering agent can flush or evict objects relative to the size of the
|
||||
cache pool(specified by ``target_max_bytes`` / ``target_max_objects`` in
|
||||
`Absolute sizing`_). When the cache pool consists of a certain percentage of
|
||||
modified (or dirty) objects, the cache tiering agent will flush them to the
|
||||
storage pool. To set the ``cache_target_dirty_ratio``, execute the following::
|
||||
storage pool. To set the ``cache_target_dirty_ratio``, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} cache_target_dirty_ratio {0.0..1.0}
|
||||
|
||||
For example, setting the value to ``0.4`` will begin flushing modified
|
||||
(dirty) objects when they reach 40% of the cache pool's capacity::
|
||||
(dirty) objects when they reach 40% of the cache pool's capacity:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set hot-storage cache_target_dirty_ratio 0.4
|
||||
|
||||
When dirty objects reach a certain percentage of the cache pool's capacity, the cache tiering agent flushes dirty
|
||||
objects with a higher speed. To set the ``cache_target_dirty_high_ratio``::
|
||||
objects with a higher speed. To set the ``cache_target_dirty_high_ratio``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} cache_target_dirty_high_ratio {0.0..1.0}
|
||||
|
||||
For example, setting the value to ``0.6`` will begin aggressively flush dirty objects
|
||||
when they reach 60% of the cache pool's capacity. obviously, we'd better set the value
|
||||
between dirty_ratio and full_ratio::
|
||||
For example, setting the value to ``0.6`` will begin aggressively flushing
|
||||
dirty objects when they reach 60% of the cache pool's capacity. This value
|
||||
should be set between ``cache_target_dirty_ratio`` and ``cache_target_full_ratio``:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set hot-storage cache_target_dirty_high_ratio 0.6
|
||||
|
||||
When the cache pool reaches a certain percentage of its capacity, the cache
|
||||
tiering agent will evict objects to maintain free capacity. To set the
|
||||
``cache_target_full_ratio``, execute the following::
|
||||
``cache_target_full_ratio``, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} cache_target_full_ratio {0.0..1.0}
|
||||
|
||||
For example, setting the value to ``0.8`` will begin flushing unmodified
|
||||
(clean) objects when they reach 80% of the cache pool's capacity::
|
||||
(clean) objects when they reach 80% of the cache pool's capacity:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set hot-storage cache_target_full_ratio 0.8
|
||||
|
||||
@ -371,21 +418,29 @@ Cache Age
|
||||
---------
|
||||
|
||||
You can specify the minimum age of an object before the cache tiering agent
|
||||
flushes a recently modified (or dirty) object to the backing storage pool::
|
||||
flushes a recently modified (or dirty) object to the backing storage pool:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cachepool} cache_min_flush_age {#seconds}
|
||||
|
||||
For example, to flush modified (or dirty) objects after 10 minutes, execute
|
||||
the following::
|
||||
For example, to flush modified (or dirty) objects after 10 minutes, execute the
|
||||
following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set hot-storage cache_min_flush_age 600
|
||||
|
||||
You can specify the minimum age of an object before it will be evicted from
|
||||
the cache tier::
|
||||
You can specify the minimum age of an object before it will be evicted from the
|
||||
cache tier:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {cache-tier} cache_min_evict_age {#seconds}
|
||||
|
||||
For example, to evict objects after 30 minutes, execute the following::
|
||||
For example, to evict objects after 30 minutes, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set hot-storage cache_min_evict_age 1800
|
||||
|
||||
@ -403,24 +458,31 @@ Removing a Read-Only Cache
|
||||
Since a read-only cache does not have modified data, you can disable
|
||||
and remove it without losing any recent changes to objects in the cache.
|
||||
|
||||
#. Change the cache-mode to ``none`` to disable it. ::
|
||||
#. Change the cache-mode to ``none`` to disable it.:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier cache-mode {cachepool} none
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier cache-mode hot-storage none
|
||||
|
||||
#. Remove the cache pool from the backing pool. ::
|
||||
#. Remove the cache pool from the backing pool.:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier remove {storagepool} {cachepool}
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier remove cold-storage hot-storage
|
||||
|
||||
|
||||
|
||||
Removing a Writeback Cache
|
||||
--------------------------
|
||||
|
||||
@ -430,39 +492,55 @@ disable and remove it.
|
||||
|
||||
|
||||
#. Change the cache mode to ``proxy`` so that new and modified objects will
|
||||
flush to the backing storage pool. ::
|
||||
flush to the backing storage pool.:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier cache-mode {cachepool} proxy
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier cache-mode hot-storage proxy
|
||||
|
||||
|
||||
#. Ensure that the cache pool has been flushed. This may take a few minutes::
|
||||
#. Ensure that the cache pool has been flushed. This may take a few minutes:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
rados -p {cachepool} ls
|
||||
|
||||
If the cache pool still has objects, you can flush them manually.
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
rados -p {cachepool} cache-flush-evict-all
|
||||
|
||||
|
||||
#. Remove the overlay so that clients will not direct traffic to the cache. ::
|
||||
#. Remove the overlay so that clients will not direct traffic to the cache.:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier remove-overlay {storagetier}
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier remove-overlay cold-storage
|
||||
|
||||
|
||||
#. Finally, remove the cache tier pool from the backing storage pool. ::
|
||||
#. Finally, remove the cache tier pool from the backing storage pool.:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier remove {storagepool} {cachepool}
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tier remove cold-storage hot-storage
|
||||
|
||||
|
@ -30,18 +30,24 @@ This mode lets you mark monitors as disallowed, in which case they will
|
||||
participate in the quorum and serve clients, but cannot be elected leader. You
|
||||
may wish to use this if you have some monitors which are known to be far away
|
||||
from clients.
|
||||
You can disallow a leader by running ::
|
||||
You can disallow a leader by running:
|
||||
|
||||
$ ceph mon add disallowed_leader {name}
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon add disallowed_leader {name}
|
||||
|
||||
You can remove a monitor from the disallowed list, and allow it to become
|
||||
a leader again, by running ::
|
||||
a leader again, by running:
|
||||
|
||||
$ ceph mon rm disallowed_leader {name}
|
||||
.. prompt:: bash $
|
||||
|
||||
The list of disallowed_leaders is included when you run ::
|
||||
ceph mon rm disallowed_leader {name}
|
||||
|
||||
$ ceph mon dump
|
||||
The list of disallowed_leaders is included when you run:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon dump
|
||||
|
||||
The connectivity Mode
|
||||
=====================
|
||||
@ -58,7 +64,9 @@ Examining connectivity scores
|
||||
=============================
|
||||
The monitors maintain connection scores even if they aren't in
|
||||
the connectivity election mode. You can examine the scores a monitor
|
||||
has by running ::
|
||||
has by running:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph daemon mon.{name} connection scores dump
|
||||
|
||||
@ -68,7 +76,9 @@ whether it returned its latest ping within the timeout).
|
||||
|
||||
While this would be an unexpected occurrence, if for some reason you experience
|
||||
problems and troubleshooting makes you think your scores have become invalid,
|
||||
you can forget history and reset them by running ::
|
||||
you can forget history and reset them by running:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph daemon mon.{name} connection scores reset
|
||||
|
||||
|
@ -8,11 +8,15 @@
|
||||
Monitor Commands
|
||||
================
|
||||
|
||||
Monitor commands are issued using the ``ceph`` utility::
|
||||
Monitor commands are issued using the ``ceph`` utility:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph [-m monhost] {command}
|
||||
|
||||
The command is usually (though not always) of the form::
|
||||
The command is usually (though not always) of the form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph {subsystem} {command}
|
||||
|
||||
@ -20,24 +24,32 @@ The command is usually (though not always) of the form::
|
||||
System Commands
|
||||
===============
|
||||
|
||||
Execute the following to display the current cluster status. ::
|
||||
Execute the following to display the current cluster status. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph -s
|
||||
ceph status
|
||||
|
||||
Execute the following to display a running summary of cluster status
|
||||
and major events. ::
|
||||
and major events. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph -w
|
||||
|
||||
Execute the following to show the monitor quorum, including which monitors are
|
||||
participating and which one is the leader. ::
|
||||
participating and which one is the leader. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon stat
|
||||
ceph quorum_status
|
||||
|
||||
Execute the following to query the status of a single monitor, including whether
|
||||
or not it is in the quorum. ::
|
||||
or not it is in the quorum. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell mon.[id] mon_status
|
||||
|
||||
@ -47,11 +59,15 @@ where the value of ``[id]`` can be determined, e.g., from ``ceph -s``.
|
||||
Authentication Subsystem
|
||||
========================
|
||||
|
||||
To add a keyring for an OSD, execute the following::
|
||||
To add a keyring for an OSD, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth add {osd} {--in-file|-i} {path-to-osd-keyring}
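
For example, assuming the conventional keyring location for ``osd.0``:

.. prompt:: bash $

ceph auth add osd.0 -i /var/lib/ceph/osd/ceph-0/keyring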
|
||||
|
||||
To list the cluster's keys and their capabilities, execute the following::
|
||||
To list the cluster's keys and their capabilities, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph auth ls
|
||||
|
||||
@ -59,7 +75,9 @@ To list the cluster's keys and their capabilities, execute the following::
|
||||
Placement Group Subsystem
|
||||
=========================
|
||||
|
||||
To display the statistics for all placement groups (PGs), execute the following::
|
||||
To display the statistics for all placement groups (PGs), execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg dump [--format {format}]
|
||||
|
||||
@ -70,7 +88,9 @@ less variable from release to release. The ``jq`` utility can be invaluable whe
|
||||
data from JSON output.
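
As a sketch of the kind of extraction ``jq`` makes convenient, the following
lists the IDs of PGs that are not ``active+clean``. The exact JSON layout can
differ between releases, so treat the field path as illustrative:

.. prompt:: bash $

ceph pg dump --format json | jq -r '.pg_map.pg_stats[] | select(.state != "active+clean") | .pgid'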
|
||||
|
||||
To display the statistics for all placement groups stuck in a specified state,
|
||||
execute the following::
|
||||
execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg dump_stuck inactive|unclean|stale|undersized|degraded [--format {format}] [-t|--threshold {seconds}]
|
||||
|
||||
@ -90,7 +110,9 @@ reported to the monitor cluster in a while (configured by
|
||||
``mon_osd_report_timeout``).
|
||||
|
||||
Delete "lost" objects or revert them to their prior state, either a previous version
|
||||
or delete them if they were just created. ::
|
||||
or delete them if they were just created. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph pg {pgid} mark_unfound_lost revert|delete
|
||||
|
||||
@ -100,102 +122,146 @@ or delete them if they were just created. ::
|
||||
OSD Subsystem
|
||||
=============
|
||||
|
||||
Query OSD subsystem status. ::
|
||||
Query OSD subsystem status. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd stat
|
||||
|
||||
Write a copy of the most recent OSD map to a file. See
|
||||
:ref:`osdmaptool <osdmaptool>`. ::
|
||||
:ref:`osdmaptool <osdmaptool>`. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd getmap -o file
|
||||
|
||||
Write a copy of the crush map from the most recent OSD map to
|
||||
file. ::
|
||||
file. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd getcrushmap -o file
|
||||
|
||||
The foregoing is functionally equivalent to ::
|
||||
The foregoing is functionally equivalent to :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd getmap -o /tmp/osdmap
|
||||
osdmaptool /tmp/osdmap --export-crush file
|
||||
|
||||
Dump the OSD map. Valid formats for ``-f`` are ``plain``, ``json``, ``json-pretty``,
|
||||
``xml``, and ``xml-pretty``. If no ``--format`` option is given, the OSD map is
|
||||
dumped as plain text. As above, JSON format is best for tools, scripting, and other automation. ::
|
||||
dumped as plain text. As above, JSON format is best for tools, scripting, and other automation. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd dump [--format {format}]
|
||||
|
||||
Dump the OSD map as a tree with one line per OSD containing weight
|
||||
and state. ::
|
||||
and state. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tree [--format {format}]
|
||||
|
||||
Find out where a specific object is or would be stored in the system::
|
||||
Find out where a specific object is or would be stored in the system:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd map <pool-name> <object-name>
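
For example, with a hypothetical pool named ``rbd`` and an object named ``my-object``:

.. prompt:: bash $

ceph osd map rbd my-object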
|
||||
|
||||
Add or move a new item (OSD) with the given id/name/weight at the specified
|
||||
location. ::
|
||||
location. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush set {id} {weight} [{loc1} [{loc2} ...]]
|
||||
|
||||
Remove an existing item (OSD) from the CRUSH map. ::
|
||||
Remove an existing item (OSD) from the CRUSH map. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush remove {name}
|
||||
|
||||
Remove an existing bucket from the CRUSH map. ::
|
||||
Remove an existing bucket from the CRUSH map. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush remove {bucket-name}
|
||||
|
||||
Move an existing bucket from one position in the hierarchy to another. ::
|
||||
Move an existing bucket from one position in the hierarchy to another. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush move {id} {loc1} [{loc2} ...]
|
||||
|
||||
Set the weight of the item given by ``{name}`` to ``{weight}``. ::
|
||||
Set the weight of the item given by ``{name}`` to ``{weight}``. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush reweight {name} {weight}
|
||||
|
||||
Mark an OSD as ``lost``. This may result in permanent data loss. Use with caution. ::
|
||||
Mark an OSD as ``lost``. This may result in permanent data loss. Use with caution. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd lost {id} [--yes-i-really-mean-it]
|
||||
|
||||
Create a new OSD. If no UUID is given, it will be set automatically when the OSD
|
||||
starts up. ::
|
||||
starts up. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd create [{uuid}]
|
||||
|
||||
Remove the given OSD(s). ::
|
||||
Remove the given OSD(s). :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd rm [{id}...]
|
||||
|
||||
Query the current ``max_osd`` parameter in the OSD map. ::
|
||||
Query the current ``max_osd`` parameter in the OSD map. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd getmaxosd
|
||||
|
||||
Import the given crush map. ::
|
||||
Import the given crush map. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd setcrushmap -i file
|
||||
|
||||
Set the ``max_osd`` parameter in the OSD map. This defaults to 10000 now so
|
||||
most admins will never need to adjust this. ::
|
||||
most admins will never need to adjust this. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd setmaxosd
|
||||
|
||||
Mark OSD ``{osd-num}`` down. ::
|
||||
Mark OSD ``{osd-num}`` down. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd down {osd-num}
|
||||
|
||||
Mark OSD ``{osd-num}`` out of the distribution (i.e. allocated no data). ::
|
||||
Mark OSD ``{osd-num}`` out of the distribution (i.e. allocated no data). :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd out {osd-num}
|
||||
|
||||
Mark ``{osd-num}`` in the distribution (i.e. allocated data). ::
|
||||
Mark ``{osd-num}`` in the distribution (i.e. allocated data). :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd in {osd-num}
|
||||
|
||||
Set or clear the pause flags in the OSD map. If set, no IO requests
|
||||
will be sent to any OSD. Clearing the flags via unpause results in
|
||||
resending pending requests. ::
|
||||
resending pending requests. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pause
|
||||
ceph osd unpause
|
||||
@ -209,7 +275,9 @@ otherwise live on this drive. It does not change weights assigned
|
||||
to the buckets above the OSD in the crush map, and is a corrective
|
||||
measure in case the normal CRUSH distribution is not working out quite
|
||||
right. For instance, if one of your OSDs is at 90% and the others are
|
||||
at 50%, you could reduce this weight to compensate. ::
|
||||
at 50%, you could reduce this weight to compensate. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd reweight {osd-num} {weight}
|
||||
|
||||
@ -219,7 +287,9 @@ default to 1.00000 and are relative only to each other; they are not absolute.
|
||||
It is crucial to distinguish them from CRUSH weights, which reflect the
|
||||
absolute capacity of a bucket in TiB. By default this command adjusts
|
||||
override weight on OSDs which have + or - 20% of the average utilization,
|
||||
but if you include a ``threshold`` that percentage will be used instead. ::
|
||||
but if you include a ``threshold`` that percentage will be used instead. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd reweight-by-utilization [threshold [max_change [max_osds]]] [--no-increasing]
|
||||
|
||||
@ -230,7 +300,9 @@ parameters can speed leveling of OSD utilization, at the potential cost of
|
||||
greater impact on client operations due to more data moving at once.
|
||||
|
||||
To determine which and how many PGs and OSDs will be affected by a given invocation
|
||||
you can test before executing. ::
|
||||
you can test before executing. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd test-reweight-by-utilization [threshold [max_change max_osds]] [--no-increasing]
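
For example, a dry run using illustrative values (a 120% utilization
threshold, a maximum weight change of 0.05, and at most 10 OSDs adjusted):

.. prompt:: bash $

ceph osd test-reweight-by-utilization 120 0.05 10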
|
||||
|
||||
@ -256,23 +328,31 @@ including the ``range`` keyword.
|
||||
|
||||
These commands are mostly only useful for failure testing, as
|
||||
blocklists are normally maintained automatically and shouldn't need
|
||||
manual intervention. ::
|
||||
manual intervention. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd blocklist ["range"] add ADDRESS[:source_port][/netmask_bits] [TIME]
|
||||
ceph osd blocklist ["range"] rm ADDRESS[:source_port][/netmask_bits]
|
||||
|
||||
Creates/deletes a snapshot of a pool. ::
|
||||
Creates/deletes a snapshot of a pool. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool mksnap {pool-name} {snap-name}
|
||||
ceph osd pool rmsnap {pool-name} {snap-name}
|
||||
|
||||
Creates/deletes/renames a storage pool. ::
|
||||
Creates/deletes/renames a storage pool. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool create {pool-name} [pg_num [pgp_num]]
|
||||
ceph osd pool delete {pool-name} [{pool-name} --yes-i-really-really-mean-it]
|
||||
ceph osd pool rename {old-name} {new-name}
|
||||
|
||||
Changes a pool setting. ::
|
||||
Changes a pool setting. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set {pool-name} {field} {value}
|
||||
|
||||
@ -283,7 +363,9 @@ Valid fields are:
|
||||
* ``pgp_num``: Effective number when calculating pg placement.
|
||||
* ``crush_rule``: rule number for mapping placement.
|
||||
|
||||
Get the value of a pool setting. ::
|
||||
Get the value of a pool setting. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool get {pool-name} {field}
|
||||
|
||||
@ -293,11 +375,15 @@ Valid fields are:
|
||||
* ``pgp_num``: Effective number of placement groups when calculating placement.
|
||||
|
||||
|
||||
Sends a scrub command to OSD ``{osd-num}``. To send the command to all OSDs, use ``*``. ::
|
||||
Sends a scrub command to OSD ``{osd-num}``. To send the command to all OSDs, use ``*``. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd scrub {osd-num}
|
||||
|
||||
Sends a repair command to OSD.N. To send the command to all OSDs, use ``*``. ::
|
||||
Sends a repair command to OSD.N. To send the command to all OSDs, use ``*``. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd repair N
|
||||
|
||||
@ -306,34 +392,48 @@ in write requests of ``BYTES_PER_WRITE`` each. By default, the test
|
||||
writes 1 GB in total in 4-MB increments.
|
||||
The benchmark is non-destructive and will not overwrite existing live
|
||||
OSD data, but might temporarily affect the performance of clients
|
||||
concurrently accessing the OSD. ::
|
||||
concurrently accessing the OSD. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell osd.N bench [TOTAL_DATA_BYTES] [BYTES_PER_WRITE]
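
For example, a small hypothetical run that writes 100 MB to ``osd.0`` in 4 MB
increments:

.. prompt:: bash $

ceph tell osd.0 bench 104857600 4194304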
|
||||
|
||||
To clear an OSD's caches between benchmark runs, use the 'cache drop' command ::
|
||||
To clear an OSD's caches between benchmark runs, use the 'cache drop' command :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell osd.N cache drop
|
||||
|
||||
To get the cache statistics of an OSD, use the 'cache status' command ::
|
||||
To get the cache statistics of an OSD, use the 'cache status' command :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell osd.N cache status
|
||||
|
||||
MDS Subsystem
|
||||
=============
|
||||
|
||||
Change configuration parameters on a running mds. ::
|
||||
Change configuration parameters on a running mds. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell mds.{mds-id} config set {setting} {value}
|
||||
|
||||
Example::
|
||||
Example:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell mds.0 config set debug_ms 1
|
||||
|
||||
Enables debug messages. ::
|
||||
Enables debug messages. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mds stat
|
||||
|
||||
Displays the status of all metadata servers. ::
|
||||
Displays the status of all metadata servers. :
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mds fail 0
|
||||
|
||||
@ -345,16 +445,22 @@ Marks the active MDS as failed, triggering failover to a standby if present.
|
||||
Mon Subsystem
|
||||
=============
|
||||
|
||||
Show monitor stats::
|
||||
Show monitor stats:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon stat
|
||||
|
||||
::
|
||||
|
||||
e2: 3 mons at {a=127.0.0.1:40000/0,b=127.0.0.1:40001/0,c=127.0.0.1:40002/0}, election epoch 6, quorum 0,1,2 a,b,c
|
||||
|
||||
|
||||
The ``quorum`` list at the end lists monitor nodes that are part of the current quorum.
|
||||
|
||||
This is also available more directly::
|
||||
This is also available more directly:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph quorum_status -f json-pretty
|
||||
|
||||
@ -410,7 +516,9 @@ This is also available more directly::
|
||||
|
||||
The above will block until a quorum is reached.
|
||||
|
||||
For a status of just a single monitor::
|
||||
For a status of just a single monitor:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph tell mon.[name] mon_status
|
||||
|
||||
@ -474,10 +582,14 @@ output::
|
||||
}
|
||||
}
|
||||
|
||||
A dump of the monitor state::
|
||||
A dump of the monitor state:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph mon dump
|
||||
|
||||
::
|
||||
|
||||
dumped monmap epoch 2
|
||||
epoch 2
|
||||
fsid ba807e74-b64f-4b72-b43f-597dfe60ddbc
|
||||
|
@ -35,7 +35,9 @@ Pool Values`_.
|
||||
Get a CRUSH Map
|
||||
---------------
|
||||
|
||||
To get the CRUSH map for your cluster, execute the following::
|
||||
To get the CRUSH map for your cluster, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd getcrushmap -o {compiled-crushmap-filename}
|
||||
|
||||
@ -48,7 +50,9 @@ edit it.
|
||||
Decompile a CRUSH Map
|
||||
---------------------
|
||||
|
||||
To decompile a CRUSH map, execute the following::
|
||||
To decompile a CRUSH map, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
crushtool -d {compiled-crushmap-filename} -o {decompiled-crushmap-filename}
|
||||
|
||||
@ -57,7 +61,9 @@ To decompile a CRUSH map, execute the following::
|
||||
Recompile a CRUSH Map
|
||||
---------------------
|
||||
|
||||
To compile a CRUSH map, execute the following::
|
||||
To compile a CRUSH map, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
crushtool -c {decompiled-crushmap-filename} -o {compiled-crushmap-filename}
|
||||
|
||||
@ -66,7 +72,9 @@ To compile a CRUSH map, execute the following::
|
||||
Set the CRUSH Map
|
||||
-----------------
|
||||
|
||||
To set the CRUSH map for your cluster, execute the following::
|
||||
To set the CRUSH map for your cluster, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd setcrushmap -i {compiled-crushmap-filename}
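
Taken together, a typical edit cycle looks like the following sketch (the file
names are arbitrary):

.. prompt:: bash $

ceph osd getcrushmap -o crushmap.bin
crushtool -d crushmap.bin -o crushmap.txt
# edit crushmap.txt, then recompile and inject it
crushtool -c crushmap.txt -o crushmap.new
ceph osd setcrushmap -i crushmap.new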
|
||||
|
||||
@ -118,14 +126,22 @@ Devices may also have a *device class* associated with them (e.g.,
|
||||
``hdd`` or ``ssd``), allowing them to be conveniently targeted by a
|
||||
crush rule.
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
devices
|
||||
|
||||
::
|
||||
|
||||
# devices
|
||||
device {num} {osd.name} [class {class}]
|
||||
|
||||
For example::
|
||||
For example:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
devices
|
||||
|
||||
::
|
||||
|
||||
# devices
|
||||
device 0 osd.0 class ssd
|
||||
device 1 osd.1 class hdd
|
||||
device 2 osd.2
|
||||
@ -136,10 +152,6 @@ is normally a single storage device, a pair of devices (for example,
|
||||
one for data and one for a journal or metadata), or in some cases a
|
||||
small RAID device.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
CRUSH Map Bucket Types
|
||||
----------------------
|
||||
|
||||
@ -157,7 +169,7 @@ media.
|
||||
To add a bucket type to the CRUSH map, create a new line under your list of
|
||||
bucket types. Enter ``type`` followed by a unique numeric ID and a bucket name.
|
||||
By convention, there is one leaf bucket and it is ``type 0``; however, you may
|
||||
give it any name you like (e.g., osd, disk, drive, storage, etc.)::
|
||||
give it any name you like (e.g., osd, disk, drive, storage)::
|
||||
|
||||
# types
|
||||
type {num} {bucket-name}
|
||||
@ -199,8 +211,8 @@ distribution units, pods, rows, rooms, and data centers. With the exception of
|
||||
the leaf nodes representing OSDs, the rest of the hierarchy is arbitrary, and
|
||||
you may define it according to your own needs.
|
||||
|
||||
We recommend adapting your CRUSH map to your firms's hardware naming conventions
|
||||
and using instances names that reflect the physical hardware. Your naming
|
||||
We recommend adapting your CRUSH map to your firm's hardware naming conventions
|
||||
and using instance names that reflect the physical hardware. Your naming
|
||||
practice can make it easier to administer the cluster and troubleshoot
|
||||
problems when an OSD and/or other hardware malfunctions and the administrator
|
||||
needs access to physical hardware.
|
||||
@ -651,27 +663,36 @@ There are three types of transformations possible:
|
||||
single bucket. For example, in the previous example, we want the
|
||||
``ssd`` bucket to be mapped to the ``default`` bucket.
|
||||
|
||||
The final command to convert the map comprised of the above fragments would be something like::
|
||||
The final command to convert the map comprising the above fragments would be something like:
|
||||
|
||||
$ ceph osd getcrushmap -o original
|
||||
$ crushtool -i original --reclassify \
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd getcrushmap -o original
|
||||
crushtool -i original --reclassify \
|
||||
--set-subtree-class default hdd \
|
||||
--reclassify-root default hdd \
|
||||
--reclassify-bucket %-ssd ssd default \
|
||||
--reclassify-bucket ssd ssd default \
|
||||
-o adjusted
|
||||
|
||||
In order to ensure that the conversion is correct, there is a ``--compare`` command that will test a large sample of inputs to the CRUSH map and ensure that the same result comes back out. These inputs are controlled by the same options that apply to the ``--test`` command. For the above example,::
|
||||
In order to ensure that the conversion is correct, there is a ``--compare`` command that will test a large sample of inputs against the CRUSH map and check that the same result is output. These inputs are controlled by the same options that apply to the ``--test`` command. For the above example,:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
crushtool -i original --compare adjusted
|
||||
|
||||
::
|
||||
|
||||
$ crushtool -i original --compare adjusted
|
||||
rule 0 had 0/10240 mismatched mappings (0)
|
||||
rule 1 had 0/10240 mismatched mappings (0)
|
||||
maps appear equivalent
|
||||
|
||||
If there were difference, you'd see what ratio of inputs are remapped
|
||||
in the parentheses.
|
||||
If there were differences, the ratio of remapped inputs would be reported in
|
||||
the parentheses.
|
||||
|
||||
If you are satisfied with the adjusted map, you can apply it to the cluster with something like::
|
||||
When you are satisfied with the adjusted map, apply it to the cluster with a command of the form:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd setcrushmap -i adjusted
|
||||
|
||||
@ -682,7 +703,9 @@ If you can ensure that all clients are running recent code, you can
|
||||
adjust the tunables by extracting the CRUSH map, modifying the values,
|
||||
and reinjecting it into the cluster.
|
||||
|
||||
* Extract the latest CRUSH map::
|
||||
* Extract the latest CRUSH map:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd getcrushmap -o /tmp/crush
|
||||
|
||||
@ -690,11 +713,15 @@ and reinjecting it into the cluster.
|
||||
for both large and small clusters we tested with. You will need to
|
||||
additionally specify the ``--enable-unsafe-tunables`` argument to
|
||||
``crushtool`` for this to work. Please use this option with
|
||||
extreme care.::
|
||||
extreme care.:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new
|
||||
|
||||
* Reinject modified map::
|
||||
* Reinject modified map:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd setcrushmap -i /tmp/crush.new
|
||||
|
||||
@ -702,7 +729,9 @@ Legacy values
|
||||
-------------
|
||||
|
||||
For reference, the legacy values for the CRUSH tunables can be set
|
||||
with::
|
||||
with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 --set-chooseleaf-descend-once 0 --set-chooseleaf-vary-r 0 -o /tmp/crush.legacy
|
||||
|
||||
@ -711,4 +740,4 @@ Further, as noted above, be careful running old versions of the
|
||||
``ceph-osd`` daemon after reverting to legacy values as the feature
|
||||
bit is not perfectly enforced.
|
||||
|
||||
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.com/wp-content/uploads/2016/08/weil-crush-sc06.pdf
|
||||
.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.io/assets/pdfs/weil-crush-sc06.pdf
|
||||
|
@ -184,7 +184,9 @@ will be the total of all devices contained beneath it. Normally
|
||||
weights are in units of terabytes (TB).
|
||||
|
||||
You can get a simple view of the CRUSH hierarchy for your cluster,
|
||||
including weights, with::
|
||||
including weights, with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tree
|
||||
|
||||
@ -208,11 +210,15 @@ erasure coded), the *failure domain*, and optionally a *device class*.
|
||||
In rare cases rules must be written by hand by manually editing the
|
||||
CRUSH map.
|
||||
|
||||
You can see what rules are defined for your cluster with::
|
||||
You can see what rules are defined for your cluster with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush rule ls
|
||||
|
||||
You can view the contents of the rules with::
|
||||
You can view the contents of the rules with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush rule dump
|
||||
|
||||
@ -224,23 +230,31 @@ default, OSDs automatically set their class at startup to
|
||||
`hdd`, `ssd`, or `nvme` based on the type of device they are backed
|
||||
by.
|
||||
|
||||
The device class for one or more OSDs can be explicitly set with::
|
||||
The device class for one or more OSDs can be explicitly set with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush set-device-class <class> <osd-name> [...]
|
||||
|
||||
Once a device class is set, it cannot be changed to another class
|
||||
until the old class is unset with::
|
||||
until the old class is unset with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush rm-device-class <osd-name> [...]
|
||||
|
||||
This allows administrators to set device classes without the class
|
||||
being changed on OSD restart or by some other script.
|
||||
|
||||
A placement rule that targets a specific device class can be created with::
|
||||
A placement rule that targets a specific device class can be created with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush rule create-replicated <rule-name> <root> <failure-domain> <class>
|
||||
|
||||
A pool can then be changed to use the new rule with::
|
||||
A pool can then be changed to use the new rule with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd pool set <pool-name> crush_rule <rule-name>
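
A minimal end-to-end sketch (the rule name ``fast-ssd`` and the pool name
``rbd`` are only examples) that creates an SSD-only replicated rule and points
an existing pool at it:

.. prompt:: bash $

ceph osd crush rule create-replicated fast-ssd default host ssd
ceph osd pool set rbd crush_rule fast-ssd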
|
||||
|
||||
@ -249,7 +263,9 @@ for each device class in use that contains only devices of that class.
|
||||
CRUSH rules can then distribute data over the shadow hierarchy.
|
||||
This approach is fully backward compatible with
|
||||
old Ceph clients. You can view the CRUSH hierarchy with shadow items
|
||||
with::
|
||||
with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush tree --show-shadow
|
||||
|
||||
@ -295,7 +311,9 @@ There are two types of weight sets supported:
|
||||
|
||||
When weight sets are in use, the weights associated with each node in
|
||||
the hierarchy are visible as a separate column (labeled either
|
||||
``(compat)`` or the pool name) from the command::
|
||||
``(compat)`` or the pool name) from the command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd tree
|
||||
|
||||
@ -320,7 +338,9 @@ Add/Move an OSD
|
||||
.. note: OSDs are normally automatically added to the CRUSH map when
|
||||
the OSD is created. This command is rarely needed.
|
||||
|
||||
To add or move an OSD in the CRUSH map of a running cluster::
|
||||
To add or move an OSD in the CRUSH map of a running cluster:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush set {name} {weight} root={root} [{bucket-type}={bucket-name} ...]
|
||||
|
||||
@ -359,7 +379,9 @@ Where:
|
||||
|
||||
|
||||
The following example adds ``osd.0`` to the hierarchy, or moves the
|
||||
OSD from a previous location. ::
|
||||
OSD from a previous location:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush set osd.0 1.0 root=default datacenter=dc1 room=room1 row=foo rack=bar host=foo-bar-1
|
||||
|
||||
@ -372,7 +394,9 @@ Adjust OSD weight
|
||||
is rarely needed.
|
||||
|
||||
To adjust an OSD's CRUSH weight in the CRUSH map of a running cluster, execute
|
||||
the following::
|
||||
the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush reweight {name} {weight}
|
||||
|
||||
@ -403,7 +427,9 @@ Remove an OSD
|
||||
``ceph osd purge`` command. This command is rarely needed.
|
||||
|
||||
To remove an OSD from the CRUSH map of a running cluster, execute the
|
||||
following::
|
||||
following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush remove {name}
|
||||
|
||||
@ -431,7 +457,9 @@ Add a Bucket
|
||||
``default`` or other root as described below.
|
||||
|
||||
To add a bucket in the CRUSH map of a running cluster, execute the
|
||||
``ceph osd crush add-bucket`` command::
|
||||
``ceph osd crush add-bucket`` command:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush add-bucket {bucket-name} {bucket-type}
|
||||
|
||||
@ -453,7 +481,9 @@ Where:
|
||||
:Example: ``rack``
|
||||
|
||||
|
||||
The following example adds the ``rack12`` bucket to the hierarchy::
|
||||
The following example adds the ``rack12`` bucket to the hierarchy:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush add-bucket rack12 rack
|
||||
|
||||
@ -461,7 +491,9 @@ Move a Bucket
|
||||
-------------
|
||||
|
||||
To move a bucket to a different location or position in the CRUSH map
|
||||
hierarchy, execute the following::
|
||||
hierarchy, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush move {bucket-name} {bucket-type}={bucket-name}, [...]
|
||||
|
||||
@ -484,7 +516,9 @@ Where:
|
||||
Remove a Bucket
|
||||
---------------
|
||||
|
||||
To remove a bucket from the CRUSH hierarchy, execute the following::
|
||||
To remove a bucket from the CRUSH hierarchy, execute the following:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush remove {bucket-name}
|
||||
|
||||
@ -499,7 +533,9 @@ Where:
|
||||
:Required: Yes
|
||||
:Example: ``rack12``
|
||||
|
||||
The following example removes the ``rack12`` bucket from the hierarchy::
|
||||
The following example removes the ``rack12`` bucket from the hierarchy:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush remove rack12
|
||||
|
||||
@ -509,22 +545,30 @@ Creating a compat weight set
|
||||
.. note: This step is normally done automatically by the ``balancer``
|
||||
module when enabled.
|
||||
|
||||
To create a *compat* weight set::
|
||||
To create a *compat* weight set:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush weight-set create-compat
|
||||
|
||||
Weights for the compat weight set can be adjusted with::
|
||||
Weights for the compat weight set can be adjusted with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush weight-set reweight-compat {name} {weight}
|
||||
|
||||
The compat weight set can be destroyed with::
|
||||
The compat weight set can be destroyed with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush weight-set rm-compat
|
||||
|
||||
Creating per-pool weight sets
|
||||
-----------------------------
|
||||
|
||||
To create a weight set for a specific pool,::
|
||||
To create a weight set for a specific pool:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush weight-set create {pool-name} {mode}
|
||||
|
||||
@ -553,15 +597,21 @@ Where:
|
||||
:Required: Yes
|
||||
:Example: ``flat``
|
||||
|
||||
To adjust the weight of an item in a weight set::
|
||||
To adjust the weight of an item in a weight set:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush weight-set reweight {pool-name} {item-name} {weight [...]}
|
||||
|
||||
To list existing weight sets,::
|
||||
To list existing weight sets:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush weight-set ls
|
||||
|
||||
To remove a weight set,::
|
||||
To remove a weight set:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush weight-set rm {pool-name}
|
||||
|
||||
@ -588,7 +638,9 @@ classify themselves as either ``hdd`` or ``ssd``, depending on the
|
||||
underlying type of device being used. These classes can also be
|
||||
customized.
|
||||
|
||||
To create a replicated rule,::
|
||||
To create a replicated rule:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush rule create-replicated {name} {root} {failure-domain-type} [{class}]
|
||||
|
||||
@ -635,11 +687,15 @@ you must include this information in the *erasure code profile*. A CRUSH
|
||||
rule will then be created from that either explicitly or automatically when
|
||||
the profile is used to create a pool.
|
||||
|
||||
The erasure code profiles can be listed with::
|
||||
The erasure code profiles can be listed with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd erasure-code-profile ls
|
||||
|
||||
An existing profile can be viewed with::
|
||||
An existing profile can be viewed with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd erasure-code-profile get {profile-name}
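
For example, every cluster ships with a profile named ``default`` that can be
inspected this way:

.. prompt:: bash $

ceph osd erasure-code-profile get default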
|
||||
|
||||
@ -659,7 +715,9 @@ The erasure code profile properties of interest are:
|
||||
* **crush-device-class**: the device class on which to place data [default: none, meaning all devices are used].
|
||||
* **k** and **m** (and, for the ``lrc`` plugin, **l**): these determine the number of erasure code shards, affecting the resulting CRUSH rule.
|
||||
|
||||
Once a profile is defined, you can create a CRUSH rule with::
|
||||
Once a profile is defined, you can create a CRUSH rule with:
|
||||
|
||||
.. prompt:: bash $
|
||||
|
||||
ceph osd crush rule create-erasure {name} {profile-name}
|
||||
|
||||
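For instance, assuming a previously created profile named ``myprofile`` (the
rule name ``ecrule`` is likewise just an example), the rule could be created
and then inspected as follows:

.. prompt:: bash $

   ceph osd crush rule create-erasure ecrule myprofile
   ceph osd crush rule dump ecrule
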
@ -671,7 +729,9 @@ Once a profile is defined, you can create a CRUSH rule with::
Deleting rules
--------------

Rules that are not in use by pools can be deleted with:

.. prompt:: bash $

   ceph osd crush rule rm {rule-name}
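Before deleting a rule it can be useful to confirm that no pool still
references it. A sketch, with ``mypool`` and ``unused_rule`` as placeholder
names:

.. prompt:: bash $

   ceph osd crush rule ls
   ceph osd pool get mypool crush_rule
   ceph osd crush rule rm unused_rule
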
@ -882,14 +942,18 @@ To make this warning go away, you have two options:
result in some data movement (possibly as much as 10%). This is the
preferred route, but should be taken with care on a production cluster
where the data movement may affect performance. You can enable optimal
tunables with:

.. prompt:: bash $

   ceph osd crush tunables optimal

If things go poorly (e.g., too much load) and not very much
progress has been made, or there is a client compatibility problem
(old kernel CephFS or RBD clients, or pre-Bobtail ``librados``
clients), you can switch back with:

.. prompt:: bash $

   ceph osd crush tunables legacy

@ -899,7 +963,9 @@ To make this warning go away, you have two options:
mon warn on legacy crush tunables = false

For the change to take effect, you will need to restart the monitors, or
apply the option to running monitors with:

.. prompt:: bash $

   ceph tell mon.\* config set mon_warn_on_legacy_crush_tunables false

@ -936,7 +1002,7 @@ sets known as *profiles*. As of the Octopus release these are:
* ``firefly``: the values supported by the firefly release
* ``hammer``: the values supported by the hammer release
* ``jewel``: the values supported by the jewel release
* ``optimal``: the best (i.e. optimal) values of the current version of Ceph
* ``default``: the default values of a new cluster installed from
  scratch. These values, which depend on the current version of Ceph,
  are hardcoded and are generally a mix of optimal and legacy values.
@ -944,7 +1010,9 @@ sets known as *profiles*. As of the Octopus release these are:
  LTS release, or the most recent release for which we generally expect
  most users to have up-to-date clients.

You can apply a profile to a running cluster with the command:

.. prompt:: bash $

   ceph osd crush tunables {PROFILE}

@ -953,8 +1021,7 @@ release notes and documentation carefully before changing the profile on a
running cluster, and consider throttling recovery/backfill parameters to
limit the impact of a bolus of backfill.

.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.io/assets/pdfs/weil-crush-sc06.pdf

Primary Affinity
@ -987,19 +1054,20 @@ interface bandwidth and CPU cycles more evenly.
By default, all Ceph OSDs have primary affinity of ``1``, which indicates that
any OSD may act as a primary with equal probability.

You can reduce a Ceph OSD's primary affinity so that CRUSH is less likely to
choose the OSD as primary in a PG's acting set:

.. prompt:: bash $

   ceph osd primary-affinity <osd-id> <weight>

You may set an OSD's primary affinity to a real number in the range ``[0-1]``,
where ``0`` indicates that the OSD may **NOT** be used as a primary and ``1``
indicates that an OSD may be used as a primary. When the weight is between
these extremes, it is less likely that CRUSH will select that OSD as a primary.
The process for selecting the lead OSD is more nuanced than a simple
probability based on relative affinity values, but measurable results can be
achieved even with first-order approximations of desirable values.
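For example, to make CRUSH roughly half as likely to pick ``osd.4`` as a
primary, something like the following could be used; the OSD id and the value
``0.5`` are purely illustrative:

.. prompt:: bash $

   ceph osd primary-affinity osd.4 0.5
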
Custom CRUSH Rules
------------------
@ -1052,7 +1120,6 @@ must not contain the same servers::
   }


Note also that on failure of an SSD, requests to a PG will be served temporarily
from a (slower) HDD OSD until the PG's data has been replicated onto the replacement
primary SSD OSD.

@ -1,4 +1,3 @@
.. _devices:

Device Management
@ -11,17 +10,23 @@ provide tools to predict and/or automatically respond to hardware failure.
Device tracking
---------------

You can query which storage devices are in use with:

.. prompt:: bash $

   ceph device ls

You can also list devices by daemon or by host:

.. prompt:: bash $

   ceph device ls-by-daemon <daemon>
   ceph device ls-by-host <host>

For any individual device, you can query information about its
location and how it is being consumed with:

.. prompt:: bash $

   ceph device info <devid>

@ -34,7 +39,9 @@ failed disks easy and less error-prone. Use the following command::

   device light on|off <devid> [ident|fault] [--force]

The ``<devid>`` parameter is the device identification. You can obtain this
information using the following command:

.. prompt:: bash $

   ceph device ls
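As a concrete (hypothetical) example, once ``ceph device ls`` has shown a
device id such as ``SAMSUNG_MZ7LM480_S2UJNX0J000001``, its identification LED
could be toggled like this:

.. prompt:: bash $

   ceph device light on SAMSUNG_MZ7LM480_S2UJNX0J000001 ident
   ceph device light off SAMSUNG_MZ7LM480_S2UJNX0J000001 ident
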
@ -43,7 +50,9 @@ By default, the `identification` light is used.

.. note::
   This command needs the Cephadm or the Rook `orchestrator <https://docs.ceph.com/docs/master/mgr/orchestrator/#orchestrator-cli-module>`_ module enabled.
   You can check whether the orchestrator module is enabled by executing the following command:

   .. prompt:: bash $

      ceph orch status

@ -77,11 +86,15 @@ or unrecoverable read errors. Other device types like SAS and NVMe
implement a similar set of metrics (via slightly different standards).
All of these can be collected by Ceph via the ``smartctl`` tool.

You can enable or disable health monitoring with:

.. prompt:: bash $

   ceph device monitoring on

or:

.. prompt:: bash $

   ceph device monitoring off

@ -89,26 +102,36 @@ or::
Scraping
--------

If monitoring is enabled, metrics will automatically be scraped at regular intervals. That interval can be configured with:

.. prompt:: bash $

   ceph config set mgr mgr/devicehealth/scrape_frequency <seconds>

The default is to scrape once every 24 hours.
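For example, to scrape twice a day instead, the interval could be halved
(43200 seconds is 12 hours; the value is only an illustration):

.. prompt:: bash $

   ceph config set mgr mgr/devicehealth/scrape_frequency 43200
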
You can manually trigger a scrape of all devices with:

.. prompt:: bash $

   ceph device scrape-health-metrics

A single device can be scraped with:

.. prompt:: bash $

   ceph device scrape-health-metrics <device-id>

Or a single daemon's devices can be scraped with:

.. prompt:: bash $

   ceph device scrape-daemon-health-metrics <who>

The stored health metrics for a device can be retrieved (optionally
for a specific timestamp) with:

.. prompt:: bash $

   ceph device get-health-metrics <devid> [sample-timestamp]

@ -121,27 +144,37 @@ health metrics it collects. There are three modes:
* *none*: disable device failure prediction.
* *local*: use a pre-trained prediction model from the ceph-mgr daemon

The prediction mode can be configured with:

.. prompt:: bash $

   ceph config set global device_failure_prediction_mode <mode>
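For instance, to turn on the built-in predictor (``local`` is one of the modes
listed above):

.. prompt:: bash $

   ceph config set global device_failure_prediction_mode local
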
Prediction normally runs in the background on a periodic basis, so it
may take some time before life expectancy values are populated. You
can see the life expectancy of all devices in output from:

.. prompt:: bash $

   ceph device ls

You can also query the metadata for a specific device with:

.. prompt:: bash $

   ceph device info <devid>

You can explicitly force prediction of a device's life expectancy with:

.. prompt:: bash $

   ceph device predict-life-expectancy <devid>

If you are not using Ceph's internal device failure prediction but
have some external source of information about device failures, you
can inform Ceph of a device's life expectancy with:

.. prompt:: bash $

   ceph device set-life-expectancy <devid> <from> [<to>]
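A hypothetical example, using an illustrative device id and ISO-style dates for
the expected failure window (treat both the id and the exact date format as
assumptions, not authoritative values):

.. prompt:: bash $

   ceph device set-life-expectancy SAMSUNG_MZ7LM480_S2UJNX0J000001 2026-01-01 2026-06-30
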
@ -156,7 +189,9 @@ The ``mgr/devicehealth/warn_threshold`` controls how soon an expected
device failure must be before we generate a health warning.

The stored life expectancy of all devices can be checked, and any
appropriate health alerts generated, with:

.. prompt:: bash $

   ceph device check-health

@ -38,19 +38,23 @@ to achieve recovery from an OSD failure.
Erasure-code profile examples
=============================

An example configuration that can be used to observe reduced bandwidth usage:

.. prompt:: bash $

   ceph osd erasure-code-profile set CLAYprofile \
      plugin=clay \
      k=4 m=2 d=5 \
      crush-failure-domain=host
   ceph osd pool create claypool erasure CLAYprofile


Creating a clay profile
=======================

To create a new clay code profile:

.. prompt:: bash $

   ceph osd erasure-code-profile set {name} \
      plugin=clay \
@ -9,7 +9,9 @@ library.
Create an isa profile
=====================

To create a new *isa* erasure code profile:

.. prompt:: bash $

   ceph osd erasure-code-profile set {name} \
      plugin=isa \

@ -13,7 +13,9 @@ understanding of the parameters.
Create a jerasure profile
=========================

To create a new *jerasure* erasure code profile:

.. prompt:: bash $

   ceph osd erasure-code-profile set {name} \
      plugin=jerasure \
@ -22,33 +22,39 @@ Reduce recovery bandwidth between hosts

Although it is probably not an interesting use case when all hosts are
connected to the same switch, reduced bandwidth usage can actually be
observed:

.. prompt:: bash $

   ceph osd erasure-code-profile set LRCprofile \
      plugin=lrc \
      k=4 m=2 l=3 \
      crush-failure-domain=host
   ceph osd pool create lrcpool erasure LRCprofile


Reduce recovery bandwidth between racks
---------------------------------------

In Firefly the bandwidth reduction will only be observed if the primary
OSD is in the same rack as the lost chunk:

.. prompt:: bash $

   ceph osd erasure-code-profile set LRCprofile \
      plugin=lrc \
      k=4 m=2 l=3 \
      crush-locality=rack \
      crush-failure-domain=host
   ceph osd pool create lrcpool erasure LRCprofile


Create an lrc profile
=====================

To create a new lrc erasure code profile:

.. prompt:: bash $

   ceph osd erasure-code-profile set {name} \
      plugin=lrc \

@ -190,13 +196,15 @@ Minimal testing

It is strictly equivalent to using a *K=2* *M=1* erasure code profile. The *DD*
implies *K=2*, the *c* implies *M=1* and the *jerasure* plugin is used
by default:

.. prompt:: bash $

   ceph osd erasure-code-profile set LRCprofile \
      plugin=lrc \
      mapping=DD_ \
      layers='[ [ "DDc", "" ] ]'
   ceph osd pool create lrcpool erasure LRCprofile

Reduce recovery bandwidth between hosts
---------------------------------------
@ -204,7 +212,9 @@ Reduce recovery bandwidth between hosts

Although it is probably not an interesting use case when all hosts are
connected to the same switch, reduced bandwidth usage can actually be
observed. It is equivalent to **k=4**, **m=2** and **l=3** although
the layout of the chunks is different. **WARNING: PROMPTS ARE SELECTABLE**

::

   $ ceph osd erasure-code-profile set LRCprofile \
      plugin=lrc \

@ -220,8 +230,10 @@ the layout of the chunks is different::
Reduce recovery bandwidth between racks
---------------------------------------

In Firefly the reduced bandwidth will only be observed if the primary OSD is in
the same rack as the lost chunk. **WARNING: PROMPTS ARE SELECTABLE**

::

   $ ceph osd erasure-code-profile set LRCprofile \
      plugin=lrc \

@ -235,6 +247,7 @@ OSD is in the same rack as the lost chunk.::
      [ "choose", "rack", 2 ],
      [ "chooseleaf", "host", 4 ],
      ]'

   $ ceph osd pool create lrcpool erasure LRCprofile

Testing with different Erasure Code backends
@ -245,16 +258,20 @@ specify the EC backend/algorithm on a per layer basis using the low
level configuration. The second argument in layers='[ [ "DDc", "" ] ]'
is actually an erasure code profile to be used for this level. The
example below specifies the ISA backend with the cauchy technique to
be used in the lrcpool:

.. prompt:: bash $

   ceph osd erasure-code-profile set LRCprofile \
      plugin=lrc \
      mapping=DD_ \
      layers='[ [ "DDc", "plugin=isa technique=cauchy" ] ]'
   ceph osd pool create lrcpool erasure LRCprofile

You could also use a different erasure code profile for each
layer. **WARNING: PROMPTS ARE SELECTABLE**

::

   $ ceph osd erasure-code-profile set LRCprofile \
      plugin=lrc \
@ -9,7 +9,9 @@ library. It allows ceph to recover data more efficiently than Reed Solomon codes
Create an SHEC profile
======================

To create a new *shec* erasure code profile:

.. prompt:: bash $

   ceph osd erasure-code-profile set {name} \
      plugin=shec \

@ -133,10 +135,11 @@ but at least increasing m without increasing c achieves improvement of recovery
Erasure code profile examples
=============================

.. prompt:: bash $

   ceph osd erasure-code-profile set SHECprofile \
      plugin=shec \
      k=8 m=4 c=3 \
      crush-failure-domain=host
   ceph osd pool create shecpool erasure SHECprofile
@ -4,58 +4,104 @@
Erasure code
=============

By default, Ceph `pools <../pools>`_ are created with the type "replicated". In
replicated-type pools, every object is copied to multiple disks (this
multiple copying is the "replication").

In contrast, `erasure-coded <https://en.wikipedia.org/wiki/Erasure_code>`_
pools use a method of data protection that is different from replication. In
erasure coding, data is broken into fragments of two kinds: data blocks and
parity blocks. If a drive fails or becomes corrupted, the parity blocks are
used to rebuild the data. At scale, erasure coding saves space relative to
replication.

In this documentation, data blocks are referred to as "data chunks"
and parity blocks are referred to as "encoding chunks".

Erasure codes are also called "forward error correction codes". The
first forward error correction code was developed in 1950 by Richard
Hamming at Bell Laboratories.


Creating a sample erasure coded pool
------------------------------------

The simplest erasure coded pool is equivalent to `RAID5
<https://en.wikipedia.org/wiki/Standard_RAID_levels#RAID_5>`_ and
requires at least three hosts:

.. prompt:: bash $

   ceph osd pool create ecpool erasure

::

   pool 'ecpool' created

.. prompt:: bash $

   echo ABCDEFGHI | rados --pool ecpool put NYAN -
   rados --pool ecpool get NYAN -

::

   ABCDEFGHI
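The pool created above uses the default erasure code profile (``k=2 m=2``; see
the note below). If you wanted the literal RAID5-like layout with two data
chunks and one coding chunk, a dedicated profile could be created first. The
profile and pool names here are only examples:

.. prompt:: bash $

   ceph osd erasure-code-profile set raid5profile k=2 m=1 crush-failure-domain=host
   ceph osd pool create ecpool_k2m1 erasure raid5profile
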
Erasure code profiles
---------------------

The default erasure code profile can sustain the loss of two OSDs. This erasure
code profile is equivalent to a replicated pool of size three, but requires
2TB to store 1TB of data instead of 3TB to store 1TB of data. The default
profile can be displayed with this command:

.. prompt:: bash $

   ceph osd erasure-code-profile get default

::

   k=2
   m=2
   plugin=jerasure
   crush-failure-domain=host
   technique=reed_sol_van

.. note::
   The default erasure-coded pool, the profile of which is displayed here, is
   not the same as the simplest erasure-coded pool.

   The default erasure-coded pool has two data chunks (k) and two coding chunks
   (m). The profile of the default erasure-coded pool is "k=2 m=2".

   The simplest erasure-coded pool has two data chunks (k) and one coding chunk
   (m). The profile of the simplest erasure-coded pool is "k=2 m=1".

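As a rule of thumb, the raw space consumed by an erasure-coded pool is roughly
``(k+m)/k`` times the logical data stored (ignoring overhead such as allocation
granularity). For the default ``k=2 m=2`` profile that factor is 2.0, which is
where the "2TB to store 1TB" figure above comes from; for ``k=2 m=1`` it is
1.5, and for the ``k=3 m=2`` example below it is about 1.67, i.e. the 67%
overhead mentioned there.
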
Choosing the right profile is important because the profile cannot be modified
after the pool is created. If you find that you need an erasure-coded pool with
a profile different than the one you have created, you must create a new pool
with a different (and presumably more carefully-considered) profile. When the
new pool is created, all objects from the wrongly-configured pool must be moved
to the newly-created pool. There is no way to alter the profile of a pool after its creation.

The most important parameters of the profile are *K*, *M* and
*crush-failure-domain* because they define the storage overhead and
the data durability. For example, if the desired architecture must
sustain the loss of two racks with a storage overhead of 67%,
the following profile can be defined:

.. prompt:: bash $

   ceph osd erasure-code-profile set myprofile \
      k=3 \
      m=2 \
      crush-failure-domain=rack
   ceph osd pool create ecpool erasure myprofile
   echo ABCDEFGHI | rados --pool ecpool put NYAN -
   rados --pool ecpool get NYAN -

::

   ABCDEFGHI

The *NYAN* object will be divided in three (*K=3*) and two additional
@ -121,19 +167,23 @@ perform full object writes and appends.

Since Luminous, partial writes for an erasure coded pool may be
enabled with a per-pool setting. This lets RBD and CephFS store their
data in an erasure coded pool:

.. prompt:: bash $

   ceph osd pool set ec_pool allow_ec_overwrites true

This can be enabled only on a pool residing on BlueStore OSDs, since
BlueStore's checksumming is used during deep scrubs to detect bitrot
or other corruption. In addition to being unsafe, using Filestore with
EC overwrites results in lower performance compared to BlueStore.

Erasure coded pools do not support omap, so to use them with RBD and
CephFS you must instruct them to store their data in an EC pool, and
their metadata in a replicated pool. For RBD, this means using the
erasure coded pool as the ``--data-pool`` during image creation:

.. prompt:: bash $

   rbd create --size 1G --data-pool ec_pool replicated_pool/image_name
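For CephFS, the analogous step is to add the erasure-coded pool as an
additional data pool and then point a directory at it through a file layout.
This is only a sketch, assuming a file system named ``cephfs`` mounted at
``/mnt/cephfs`` (both names are illustrative):

.. prompt:: bash $

   ceph fs add_data_pool cephfs ec_pool
   setfattr -n ceph.dir.layout.pool -v ec_pool /mnt/cephfs/archive
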
@ -145,37 +195,41 @@ Erasure coded pool and cache tiering
------------------------------------

Erasure coded pools require more resources than replicated pools and
lack some functionality such as omap. To overcome these
limitations, one can set up a `cache tier <../cache-tiering>`_
before the erasure coded pool.

For instance, if the pool *hot-storage* is made of fast storage:

.. prompt:: bash $

   ceph osd tier add ecpool hot-storage
   ceph osd tier cache-mode hot-storage writeback
   ceph osd tier set-overlay ecpool hot-storage

will place the *hot-storage* pool as tier of *ecpool* in *writeback*
mode so that every write and read to the *ecpool* are actually using
the *hot-storage* and benefit from its flexibility and speed.

More information can be found in the `cache tiering
<../cache-tiering>`_ documentation. Note however that cache tiering
is deprecated and may be removed completely in a future release.

Erasure coded pool recovery
---------------------------
If an erasure coded pool loses some data shards, it must recover them from others.
This involves reading from the remaining shards, reconstructing the data, and
writing new shards.
In Octopus and later releases, erasure-coded pools can recover as long as there are at least *K* shards
available. (With fewer than *K* shards, you have actually lost data!)

Prior to Octopus, erasure coded pools required at least ``min_size`` shards to be
available, even if ``min_size`` is greater than ``K``. We recommend ``min_size``
be ``K+2`` or more to prevent loss of writes and data.
This conservative decision was made out of an abundance of caution when
designing the new pool mode. As a result pools with lost OSDs but without
complete loss of any data were unable to recover and go active
without manual intervention to temporarily change the ``min_size`` setting.
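To inspect (and, if recovery requires it, temporarily relax) this setting on a
specific pool, something like the following can be used; ``ecpool`` is the
example pool from above and ``4`` corresponds to ``K+2`` for the default
``k=2 m=2`` profile:

.. prompt:: bash $

   ceph osd pool get ecpool min_size
   ceph osd pool set ecpool min_size 4
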
Glossary
--------
