From a2f5a7e75580f8a365fb240ea1688fb4e77a0ca4 Mon Sep 17 00:00:00 2001 From: Thomas Lamprecht Date: Thu, 31 Aug 2023 14:47:29 +0200 Subject: [PATCH] import ceph pacific 16.2.14 source Signed-off-by: Thomas Lamprecht --- ceph/CMakeLists.txt | 2 +- ceph/PendingReleaseNotes | 70 + ceph/ceph.spec | 6 +- ceph/changelog.upstream | 10 +- ceph/debian/cephfs-mirror.install | 2 + ceph/doc/cephadm/operations.rst | 35 +- ceph/doc/cephadm/services/index.rst | 52 + ceph/doc/cephadm/services/monitoring.rst | 16 +- ceph/doc/cephadm/services/osd.rst | 2 +- ceph/doc/cephfs/cephfs-mirroring.rst | 175 +- ceph/doc/cephfs/cephfs-top.rst | 12 +- ceph/doc/cephfs/disaster-recovery-experts.rst | 31 +- ceph/doc/cephfs/fs-volumes.rst | 515 +- ceph/doc/cephfs/health-messages.rst | 16 +- ceph/doc/cephfs/mds-config-ref.rst | 19 + ceph/doc/cephfs/mount-using-fuse.rst | 3 +- ceph/doc/cephfs/mount-using-kernel-driver.rst | 22 + ceph/doc/cephfs/nfs.rst | 12 + ceph/doc/cephfs/scrub.rst | 11 + ceph/doc/cephfs/snap-schedule.rst | 13 + ceph/doc/cephfs/troubleshooting.rst | 92 + ceph/doc/dev/network-encoding.rst | 3 +- .../osd_internals/erasure_coding/jerasure.rst | 4 +- ceph/doc/dev/osd_internals/past_intervals.rst | 93 + ceph/doc/glossary.rst | 13 +- ceph/doc/index.rst | 6 + ceph/doc/man/8/cephfs-top.rst | 16 + ceph/doc/man/8/mount.ceph.rst | 10 + ceph/doc/mgr/prometheus.rst | 83 +- ceph/doc/mgr/telemetry.rst | 21 + .../configuration/bluestore-config-ref.rst | 579 +- ceph/doc/rados/configuration/common.rst | 2 +- .../configuration/filestore-config-ref.rst | 161 +- .../rados/configuration/mon-config-ref.rst | 45 +- .../rados/configuration/storage-devices.rst | 1 + ceph/doc/rados/operations/balancer.rst | 163 +- .../rados/operations/bluestore-migration.rst | 2 + ceph/doc/rados/operations/cache-tiering.rst | 4 + ceph/doc/rados/operations/crush-map.rst | 2 +- ceph/doc/rados/operations/data-placement.rst | 62 +- ceph/doc/rados/operations/devices.rst | 161 +- .../operations/erasure-code-jerasure.rst | 8 +- ceph/doc/rados/operations/health-checks.rst | 2 + .../rados/operations/monitoring-osd-pg.rst | 11 +- ceph/doc/rados/operations/pg-concepts.rst | 2 + ceph/doc/rados/operations/stretch-mode.rst | 347 +- ceph/doc/rados/operations/user-management.rst | 411 +- ceph/doc/radosgw/dynamicresharding.rst | 5 +- ceph/doc/radosgw/multisite.rst | 17 +- ceph/doc/radosgw/notifications.rst | 3 - ceph/doc/start/documenting-ceph.rst | 2 + ceph/doc/start/get-involved.rst | 5 +- ceph/doc/start/intro.rst | 26 +- ceph/doc/start/os-recommendations.rst | 15 +- ceph/install-deps.sh | 106 +- ceph/qa/rgw/ignore-pg-availability.yaml | 2 + .../fs/functional/tasks/alternate-pool.yaml | 1 - .../fs/functional/tasks/client-recovery.yaml | 3 + ceph/qa/suites/fs/mirror-ha/cephfs-mirror/+ | 0 .../cephfs-mirror/1-volume-create-rm.yaml | 14 + ...-cluster.yaml => 2-three-per-cluster.yaml} | 0 .../workloads/cephfs-mirror-ha-workunit.yaml | 4 - .../multiclient/tasks/cephfs_misc_tests.yaml | 1 + ceph/qa/suites/fs/top/cluster/1-node.yaml | 4 +- .../fs/volumes/tasks/volumes/test/basic.yaml | 1 + ceph/qa/suites/fs/workload/subvolume/$ | 0 .../fs/workload/subvolume/no-subvolume.yaml | 0 .../with-namespace-isolated-and-quota.yaml | 11 + .../subvolume/with-namespace-isolated.yaml | 11 + .../subvolume/with-no-extra-options.yaml | 10 + .../fs/workload/subvolume/with-quota.yaml | 11 + .../workloads/krbd_diff_continuous.yaml | 12 + .../qa/suites/orch/cephadm/workunits/task/.qa | 1 + .../workunits/task/test_iscsi_container/+ | 0 .../workunits/task/test_iscsi_container/.qa | 1 + 
.../centos_8.stream_container_tools.yaml | 1 + .../test_iscsi_container.yaml} | 1 + ceph/qa/suites/rados/rook | 1 - .../singleton/all/thrash-backfill-full.yaml | 2 +- .../rados/singleton/all/thrash-eio.yaml | 4 +- .../rados/verify/tasks/rados_api_tests.yaml | 4 + .../workloads/rbd_nbd_diff_continuous.yaml | 14 + .../singleton/all/qemu-iotests-no-cache.yaml | 1 + .../all/qemu-iotests-writearound.yaml | 1 + .../singleton/all/qemu-iotests-writeback.yaml | 1 + .../all/qemu-iotests-writethrough.yaml | 1 + ...zone-plus-pubsub.yaml => three-zones.yaml} | 5 +- .../suites/rgw/verify/tasks/versioning.yaml | 5 + .../parallel/workload/rados_api.yaml | 2 + .../4-workload/rbd-cls.yaml | 2 + .../2-first-half-tasks/rbd-cls.yaml | 2 + .../stress-split/3-stress-tasks/rbd-cls.yaml | 2 + ceph/qa/tasks/ceph.py | 9 + ceph/qa/tasks/ceph_deploy.py | 6 + ceph/qa/tasks/ceph_fuse.py | 14 + ceph/qa/tasks/ceph_manager.py | 7 +- ceph/qa/tasks/ceph_test_case.py | 35 +- ceph/qa/tasks/cephadm.py | 7 +- ceph/qa/tasks/cephfs/cephfs_test_case.py | 2 +- ceph/qa/tasks/cephfs/filesystem.py | 38 +- ceph/qa/tasks/cephfs/fuse_mount.py | 3 + ceph/qa/tasks/cephfs/kernel_mount.py | 2 + ceph/qa/tasks/cephfs/mount.py | 67 +- ceph/qa/tasks/cephfs/test_client_limits.py | 2 +- ceph/qa/tasks/cephfs/test_client_recovery.py | 109 + ceph/qa/tasks/cephfs/test_data_scan.py | 84 +- ceph/qa/tasks/cephfs/test_exports.py | 63 + ceph/qa/tasks/cephfs/test_failover.py | 35 +- ceph/qa/tasks/cephfs/test_fstop.py | 101 +- ceph/qa/tasks/cephfs/test_misc.py | 43 + ceph/qa/tasks/cephfs/test_nfs.py | 184 +- ceph/qa/tasks/cephfs/test_scrub_checks.py | 44 + ceph/qa/tasks/cephfs/test_snap_schedules.py | 22 + ceph/qa/tasks/cephfs/test_snapshots.py | 10 + ceph/qa/tasks/cephfs/test_subvolume.py | 170 + ceph/qa/tasks/cephfs/test_volumes.py | 18 +- ceph/qa/tasks/mgr/dashboard/test_rbd.py | 32 +- ceph/qa/tasks/mgr/mgr_test_case.py | 6 +- ceph/qa/tasks/mon_thrash.py | 40 +- ceph/qa/tasks/qemu.py | 15 +- ceph/qa/tasks/rgw_multisite.py | 10 +- ceph/qa/tasks/rgw_multisite_tests.py | 5 +- ceph/qa/tasks/rook.py | 6 + ceph/qa/valgrind.supp | 16 + .../workunits/cephadm/test_iscsi_etc_hosts.sh | 21 + ceph/qa/workunits/cephtool/test.sh | 1 + ceph/qa/workunits/fs/misc/subvolume.sh | 63 - ceph/qa/workunits/libcephfs/test.sh | 1 + ceph/qa/workunits/mon/pg_autoscaler.sh | 10 + ceph/qa/workunits/rbd/cli_generic.sh | 178 + ceph/qa/workunits/rbd/diff_continuous.sh | 138 +- ceph/qa/workunits/rgw/common.py | 57 + ceph/qa/workunits/rgw/run-versioning.sh | 19 + ceph/qa/workunits/rgw/test_rgw_reshard.py | 61 +- ceph/qa/workunits/rgw/test_rgw_versioning.py | 110 + ceph/src/.git_version | 4 +- ceph/src/blk/kernel/KernelDevice.cc | 4 +- .../ceph_volume/devices/lvm/batch.py | 7 +- .../ceph_volume/devices/lvm/deactivate.py | 2 +- .../ceph_volume/devices/lvm/migrate.py | 44 +- .../ceph_volume/devices/lvm/prepare.py | 14 +- .../ceph_volume/devices/lvm/zap.py | 10 +- .../ceph_volume/devices/raw/prepare.py | 13 +- .../ceph_volume/drive_group/main.py | 2 +- .../tests/devices/lvm/test_deactivate.py | 2 +- .../tests/devices/lvm/test_migrate.py | 450 ++ .../ceph-volume/ceph_volume/util/device.py | 11 +- .../ceph_volume/util/encryption.py | 27 +- ceph/src/cephadm/cephadm | 128 +- ceph/src/cephadm/tests/test_cephadm.py | 77 + ceph/src/client/Client.cc | 268 +- ceph/src/client/Client.h | 8 +- ceph/src/client/Dentry.h | 1 + ceph/src/client/Inode.cc | 1 - ceph/src/client/MetaRequest.cc | 4 +- ceph/src/client/MetaRequest.h | 6 +- ceph/src/cls/cephfs/cls_cephfs.h | 5 +- 
ceph/src/cls/cephfs/cls_cephfs_client.cc | 35 +- ceph/src/cls/cephfs/cls_cephfs_client.h | 1 + ceph/src/cls/queue/cls_queue_src.cc | 12 +- ceph/src/cls/rbd/cls_rbd.cc | 18 - ceph/src/common/OutputDataSocket.cc | 2 + ceph/src/common/TrackedOp.cc | 8 + ceph/src/common/ceph_crypto.cc | 19 +- ceph/src/common/ceph_crypto.h | 1 + ceph/src/common/crc32c_aarch64.c | 10 +- ceph/src/common/legacy_config_opts.h | 6 + ceph/src/common/options.cc | 47 +- ceph/src/crimson/osd/osd.cc | 23 +- ceph/src/crimson/osd/pg.h | 3 +- ceph/src/include/ceph_fs.h | 85 +- ceph/src/include/cephfs/ceph_ll_client.h | 8 + ceph/src/include/cephfs/libcephfs.h | 25 +- ceph/src/include/compat.h | 10 +- ceph/src/include/types.h | 1 - ceph/src/include/utime.h | 2 +- ceph/src/kv/KeyValueDB.h | 9 +- ceph/src/kv/RocksDBStore.cc | 77 +- ceph/src/kv/RocksDBStore.h | 15 +- ceph/src/librados/IoCtxImpl.cc | 9 +- ceph/src/librados/IoCtxImpl.h | 3 +- ceph/src/librados/ObjectOperationImpl.h | 27 + ceph/src/librados/librados_c.cc | 109 +- ceph/src/librados/librados_cxx.cc | 23 +- ceph/src/librbd/ManagedLock.cc | 27 +- ceph/src/librbd/api/Io.cc | 48 +- ceph/src/librbd/api/Mirror.cc | 2 +- ceph/src/librbd/cache/ImageWriteback.cc | 16 +- .../src/librbd/cache/WriteLogImageDispatch.cc | 20 +- ceph/src/librbd/cache/WriteLogImageDispatch.h | 17 +- ceph/src/librbd/crypto/CryptoImageDispatch.h | 21 +- ceph/src/librbd/crypto/luks/FormatRequest.cc | 3 +- .../librbd/exclusive_lock/ImageDispatch.cc | 20 +- .../src/librbd/exclusive_lock/ImageDispatch.h | 11 +- ceph/src/librbd/io/ImageDispatch.cc | 71 +- ceph/src/librbd/io/ImageDispatch.h | 17 +- ceph/src/librbd/io/ImageDispatchInterface.h | 17 +- ceph/src/librbd/io/ImageDispatchSpec.h | 25 +- ceph/src/librbd/io/ImageDispatcher.cc | 21 +- ceph/src/librbd/io/ImageRequest.cc | 40 +- ceph/src/librbd/io/ImageRequest.h | 64 +- ceph/src/librbd/io/QosImageDispatch.cc | 17 +- ceph/src/librbd/io/QosImageDispatch.h | 17 +- ceph/src/librbd/io/QueueImageDispatch.cc | 15 +- ceph/src/librbd/io/QueueImageDispatch.h | 17 +- ceph/src/librbd/io/RefreshImageDispatch.cc | 17 +- ceph/src/librbd/io/RefreshImageDispatch.h | 17 +- .../io/SimpleSchedulerObjectDispatch.cc | 3 +- ceph/src/librbd/io/WriteBlockImageDispatch.cc | 17 +- ceph/src/librbd/io/WriteBlockImageDispatch.h | 17 +- ceph/src/librbd/journal/Replay.cc | 12 +- .../librbd/managed_lock/GetLockerRequest.cc | 10 +- ceph/src/librbd/migration/ImageDispatch.cc | 11 +- ceph/src/librbd/migration/ImageDispatch.h | 11 +- .../mirror/snapshot/CreatePrimaryRequest.cc | 23 +- .../mirror/snapshot/UnlinkPeerRequest.cc | 81 +- .../mirror/snapshot/UnlinkPeerRequest.h | 17 +- ceph/src/log/Log.cc | 120 +- ceph/src/log/Log.h | 91 +- ceph/src/log/test.cc | 123 + ceph/src/mds/Beacon.cc | 18 + ceph/src/mds/CDentry.cc | 2 + ceph/src/mds/CDentry.h | 6 +- ceph/src/mds/CDir.cc | 1 + ceph/src/mds/CInode.h | 4 +- ceph/src/mds/MDCache.cc | 113 +- ceph/src/mds/MDCache.h | 26 +- ceph/src/mds/MDLog.cc | 13 +- ceph/src/mds/MDSDaemon.cc | 3 +- ceph/src/mds/MDSRank.cc | 19 +- ceph/src/mds/MDSRank.h | 27 +- ceph/src/mds/Migrator.cc | 2 - ceph/src/mds/ScrubHeader.h | 6 +- ceph/src/mds/ScrubStack.cc | 43 +- ceph/src/mds/ScrubStack.h | 2 + ceph/src/mds/Server.cc | 245 +- ceph/src/mds/Server.h | 19 +- ceph/src/mds/SessionMap.cc | 3 + ceph/src/mds/SessionMap.h | 5 + ceph/src/mds/StrayManager.cc | 42 +- ceph/src/mds/cephfs_features.cc | 1 + ceph/src/mds/cephfs_features.h | 4 +- ceph/src/mds/events/EMetaBlob.h | 2 +- ceph/src/mds/journal.cc | 64 +- ceph/src/mds/mdstypes.cc | 13 +- 
ceph/src/messages/MClientReply.h | 6 +- ceph/src/messages/MClientRequest.h | 36 +- ceph/src/messages/MMDSBeacon.h | 4 + ceph/src/messages/MMgrBeacon.h | 41 +- ceph/src/messages/MOSDMap.h | 40 +- ceph/src/mgr/ActivePyModules.cc | 4 - ceph/src/mgr/PyModuleRegistry.h | 10 +- ceph/src/mon/ConfigMonitor.cc | 1 + ceph/src/mon/FSCommands.cc | 8 + ceph/src/mon/MDSMonitor.cc | 31 +- ceph/src/mon/MgrMap.h | 52 +- ceph/src/mon/MgrMonitor.cc | 64 +- ceph/src/mon/MgrMonitor.h | 8 +- ceph/src/mon/MonClient.cc | 5 + ceph/src/mon/Monitor.cc | 11 +- ceph/src/mon/OSDMonitor.cc | 22 +- ceph/src/mon/PaxosService.cc | 2 +- ceph/src/mon/PaxosService.h | 3 +- ceph/src/os/bluestore/Allocator.cc | 4 +- ceph/src/os/bluestore/Allocator.h | 8 +- ceph/src/os/bluestore/AvlAllocator.cc | 30 +- ceph/src/os/bluestore/AvlAllocator.h | 11 +- ceph/src/os/bluestore/BitmapAllocator.cc | 25 +- ceph/src/os/bluestore/BitmapAllocator.h | 1 - ceph/src/os/bluestore/BlueFS.cc | 1981 ++++--- ceph/src/os/bluestore/BlueFS.h | 231 +- ceph/src/os/bluestore/BlueRocksEnv.cc | 7 +- ceph/src/os/bluestore/BlueStore.cc | 62 +- ceph/src/os/bluestore/BlueStore.h | 9 - ceph/src/os/bluestore/StupidAllocator.cc | 14 +- ceph/src/os/bluestore/StupidAllocator.h | 5 +- ceph/src/os/bluestore/bluefs_types.cc | 1 - ceph/src/os/bluestore/bluefs_types.h | 27 +- ceph/src/osd/OSD.cc | 49 +- ceph/src/osd/OSD.h | 1 - ceph/src/osd/OSDCap.cc | 6 +- ceph/src/osd/OSDMap.cc | 12 + ceph/src/osd/OSDMap.h | 1 + ceph/src/osd/PG.cc | 4 +- ceph/src/osd/PG.h | 2 +- ceph/src/osd/PeeringState.cc | 6 +- ceph/src/osd/PeeringState.h | 2 +- ceph/src/osd/osd_types.cc | 12 +- ceph/src/osd/osd_types.h | 3 + ceph/src/osdc/Objecter.cc | 7 +- ceph/src/perfglue/cpu_profiler.cc | 4 +- ceph/src/pybind/ceph_argparse.py | 16 +- ceph/src/pybind/cephfs/cephfs.pyx | 9 +- ceph/src/pybind/mgr/cephadm/module.py | 53 +- ceph/src/pybind/mgr/cephadm/serve.py | 8 + .../mgr/cephadm/services/cephadmservice.py | 14 +- .../pybind/mgr/cephadm/services/monitoring.py | 9 +- ceph/src/pybind/mgr/cephadm/services/osd.py | 10 +- ceph/src/pybind/mgr/cephadm/tests/fixtures.py | 4 +- .../pybind/mgr/cephadm/tests/test_cephadm.py | 139 +- .../mgr/cephadm/tests/test_scheduling.py | 34 +- .../pybind/mgr/cephadm/tests/test_services.py | 10 +- .../pybind/mgr/cephadm/tests/test_upgrade.py | 6 +- ceph/src/pybind/mgr/cephadm/utils.py | 2 +- .../mgr/dashboard/ci/cephadm/start-cluster.sh | 14 +- .../pybind/mgr/dashboard/controllers/rbd.py | 11 +- .../dashboard/controllers/rbd_mirroring.py | 2 +- .../pybind/mgr/dashboard/controllers/saml2.py | 5 +- .../workflow/06-cluster-check.e2e-spec.ts | 5 + .../dist/en-US/483.57cfde62253651646349.js | 1 + .../dist/en-US/483.e54c767c9033c13a1c71.js | 1 - .../dashboard/frontend/dist/en-US/index.html | 2 +- ...328481.js => main.e617fb1a44bc4fbd5b8c.js} | 2 +- ...afe.js => runtime.57d4c22827fd93a5134f.js} | 2 +- .../dashboard/frontend/proxy.conf.json.sample | 5 + .../rbd-form/rbd-form-edit-request.model.ts | 1 + .../block/rbd-list/rbd-list.component.html | 10 + .../ceph/block/rbd-list/rbd-list.component.ts | 30 +- .../rbd-snapshot-form-modal.component.html | 22 +- .../rbd-snapshot-form-modal.component.ts | 12 +- .../service-form/service-form.component.html | 59 + .../service-form.component.spec.ts | 42 +- .../service-form/service-form.component.ts | 13 +- .../src/app/shared/api/rbd.service.spec.ts | 7 +- .../src/app/shared/api/rbd.service.ts | 9 +- .../app/shared/models/service.interface.ts | 2 + ceph/src/pybind/mgr/dashboard/module.py | 2 +- ceph/src/pybind/mgr/dashboard/openapi.yaml | 6 + 
ceph/src/pybind/mgr/mgr_module.py | 1 + ceph/src/pybind/mgr/mgr_util.py | 10 - ceph/src/pybind/mgr/nfs/export.py | 15 +- ceph/src/pybind/mgr/nfs/tests/test_nfs.py | 3 +- ceph/src/pybind/mgr/nfs/utils.py | 20 + .../src/pybind/mgr/orchestrator/_interface.py | 4 +- ceph/src/pybind/mgr/orchestrator/module.py | 12 +- ceph/src/pybind/mgr/pg_autoscaler/module.py | 5 +- .../rbd_support/mirror_snapshot_schedule.py | 27 +- ceph/src/pybind/mgr/rbd_support/module.py | 57 +- ceph/src/pybind/mgr/rbd_support/perf.py | 17 +- ceph/src/pybind/mgr/rbd_support/schedule.py | 2 + ceph/src/pybind/mgr/rbd_support/task.py | 23 +- .../mgr/rbd_support/trash_purge_schedule.py | 24 +- .../mgr/snap_schedule/fs/schedule_client.py | 18 +- ceph/src/pybind/mgr/snap_schedule/module.py | 36 +- ceph/src/pybind/mgr/volumes/fs/fs_util.py | 5 +- .../mgr/volumes/fs/operations/volume.py | 4 +- ceph/src/pybind/mgr/volumes/fs/volume.py | 9 - ceph/src/pybind/rados/rados.pyx | 9 + ceph/src/pybind/rbd/rbd.pyx | 12 +- .../ceph/deployment/drive_group.py | 6 +- .../deployment/drive_selection/selector.py | 7 +- .../ceph/deployment/service_spec.py | 98 +- ceph/src/rgw/rgw_admin.cc | 7 - ceph/src/rgw/rgw_ldap.h | 4 +- ceph/src/rgw/rgw_op.cc | 2 +- ceph/src/rgw/rgw_rados.cc | 175 +- ceph/src/rgw/rgw_rados.h | 20 +- ceph/src/rgw/rgw_sal_rados.cc | 2 +- ceph/src/test/cls_rbd/test_cls_rbd.cc | 8 +- ceph/src/test/libcephfs/CMakeLists.txt | 14 + ceph/src/test/libcephfs/suidsgid.cc | 331 ++ ceph/src/test/libcephfs/test.cc | 102 - ceph/src/test/libcephfs/vxattr.cc | 4 - ceph/src/test/librados/TestCase.cc | 13 +- ceph/src/test/librados/aio.cc | 37 +- ceph/src/test/librados/aio_cxx.cc | 311 +- ceph/src/test/librados/test_shared.h | 2 +- ceph/src/test/librados/testcase_cxx.cc | 25 +- ceph/src/test/librados/watch_notify.cc | 7 +- .../librados_test_stub/LibradosTestStub.cc | 4 +- .../librados_test_stub/MockTestMemIoCtxImpl.h | 12 +- .../librados_test_stub/NeoradosTestStub.cc | 2 +- .../test/librados_test_stub/TestIoCtxImpl.cc | 2 +- .../test/librados_test_stub/TestIoCtxImpl.h | 2 +- .../test/librados_test_stub/TestMemCluster.cc | 9 +- .../librados_test_stub/TestWatchNotify.cc | 11 +- .../test/librbd/io/test_mock_ImageRequest.cc | 41 +- .../test/librbd/journal/test_mock_Replay.cc | 15 +- .../test_mock_GetLockerRequest.cc | 44 + .../test_mock_CreatePrimaryRequest.cc | 71 +- .../snapshot/test_mock_UnlinkPeerRequest.cc | 136 +- .../test/librbd/mock/io/MockImageDispatch.h | 17 +- ceph/src/test/librbd/test_internal.cc | 72 + ceph/src/test/librbd/test_mirroring.cc | 4 +- ceph/src/test/librbd/test_mock_ManagedLock.cc | 29 +- ceph/src/test/mon/test_mon_workloadgen.cc | 6 +- ceph/src/test/objectstore/store_test.cc | 21 +- ceph/src/test/objectstore/test_bluefs.cc | 224 +- ceph/src/test/osd/osdcap.cc | 43 + ceph/src/test/pybind/test_ceph_argparse.py | 28 +- ceph/src/test/pybind/test_rbd.py | 6 +- .../snapshot/test_mock_Replayer.cc | 25 +- .../test_mock_BootstrapRequest.cc | 53 + .../rbd_mirror/test_mock_ImageReplayer.cc | 39 + .../test_mock_MirrorStatusUpdater.cc | 12 +- ceph/src/test/rgw/rgw_multi/tests_ps.py | 4958 ----------------- ceph/src/test/rgw/rgw_multi/zone_ps.py | 428 -- ceph/src/test/rgw/test_multi.md | 3 - ceph/src/test/rgw/test_multi.py | 26 +- ceph/src/test/system/systest_runnable.cc | 6 +- ceph/src/tools/ceph-dencoder/rbd_types.h | 2 +- ceph/src/tools/cephfs/DataScan.cc | 254 +- ceph/src/tools/cephfs/DataScan.h | 2 + ceph/src/tools/cephfs/top/CMakeLists.txt | 4 + ceph/src/tools/cephfs/top/cephfs-top | 361 +- ceph/src/tools/kvstore_tool.cc | 2 +- 
ceph/src/tools/rbd_mirror/ImageReplayer.cc | 20 +- .../image_replayer/snapshot/Replayer.cc | 7 +- ceph/src/vstart.sh | 5 + 409 files changed, 11252 insertions(+), 9636 deletions(-) create mode 100644 ceph/doc/dev/osd_internals/past_intervals.rst create mode 100644 ceph/qa/suites/fs/mirror-ha/cephfs-mirror/+ create mode 100644 ceph/qa/suites/fs/mirror-ha/cephfs-mirror/1-volume-create-rm.yaml rename ceph/qa/suites/fs/mirror-ha/cephfs-mirror/{three-per-cluster.yaml => 2-three-per-cluster.yaml} (100%) create mode 100644 ceph/qa/suites/fs/workload/subvolume/$ create mode 100644 ceph/qa/suites/fs/workload/subvolume/no-subvolume.yaml create mode 100644 ceph/qa/suites/fs/workload/subvolume/with-namespace-isolated-and-quota.yaml create mode 100644 ceph/qa/suites/fs/workload/subvolume/with-namespace-isolated.yaml create mode 100644 ceph/qa/suites/fs/workload/subvolume/with-no-extra-options.yaml create mode 100644 ceph/qa/suites/fs/workload/subvolume/with-quota.yaml create mode 100644 ceph/qa/suites/krbd/thrash/workloads/krbd_diff_continuous.yaml create mode 120000 ceph/qa/suites/orch/cephadm/workunits/task/.qa create mode 100644 ceph/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/+ create mode 120000 ceph/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/.qa create mode 120000 ceph/qa/suites/orch/cephadm/workunits/task/test_iscsi_container/centos_8.stream_container_tools.yaml rename ceph/qa/suites/orch/cephadm/workunits/task/{test_iscsi_pids_limit.yaml => test_iscsi_container/test_iscsi_container.yaml} (88%) delete mode 120000 ceph/qa/suites/rados/rook create mode 100644 ceph/qa/suites/rbd/nbd/workloads/rbd_nbd_diff_continuous.yaml rename ceph/qa/suites/rgw/multisite/realms/{three-zone-plus-pubsub.yaml => three-zones.yaml} (84%) create mode 100644 ceph/qa/suites/rgw/verify/tasks/versioning.yaml create mode 100644 ceph/qa/tasks/cephfs/test_subvolume.py create mode 100755 ceph/qa/workunits/cephadm/test_iscsi_etc_hosts.sh delete mode 100755 ceph/qa/workunits/fs/misc/subvolume.sh create mode 100755 ceph/qa/workunits/rgw/common.py create mode 100755 ceph/qa/workunits/rgw/run-versioning.sh create mode 100755 ceph/qa/workunits/rgw/test_rgw_versioning.py create mode 100644 ceph/src/librados/ObjectOperationImpl.h create mode 100644 ceph/src/pybind/mgr/dashboard/frontend/dist/en-US/483.57cfde62253651646349.js delete mode 100644 ceph/src/pybind/mgr/dashboard/frontend/dist/en-US/483.e54c767c9033c13a1c71.js rename ceph/src/pybind/mgr/dashboard/frontend/dist/en-US/{main.863ed935b3f00f328481.js => main.e617fb1a44bc4fbd5b8c.js} (65%) rename ceph/src/pybind/mgr/dashboard/frontend/dist/en-US/{runtime.89a9f685232e870f1afe.js => runtime.57d4c22827fd93a5134f.js} (63%) create mode 100644 ceph/src/test/libcephfs/suidsgid.cc delete mode 100644 ceph/src/test/rgw/rgw_multi/tests_ps.py delete mode 100644 ceph/src/test/rgw/rgw_multi/zone_ps.py diff --git a/ceph/CMakeLists.txt b/ceph/CMakeLists.txt index 730f6d3db..561ce9284 100644 --- a/ceph/CMakeLists.txt +++ b/ceph/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2) # remove cmake/modules/FindPython* once 3.12 is required project(ceph - VERSION 16.2.13 + VERSION 16.2.14 LANGUAGES CXX C ASM) foreach(policy diff --git a/ceph/PendingReleaseNotes b/ceph/PendingReleaseNotes index cba4e03dd..8322ba3ad 100644 --- a/ceph/PendingReleaseNotes +++ b/ceph/PendingReleaseNotes @@ -32,6 +32,17 @@ in certain recovery scenarios, e.g., monitor database lost and rebuilt, and the restored file system is expected to have the same ID as before. 
+>= 16.2.14 +---------- + +* CEPHFS: After recovering a Ceph File System post following the disaster recovery + procedure, the recovered files under `lost+found` directory can now be deleted. + +* `ceph mgr dump` command now displays the name of the mgr module that + registered a RADOS client in the `name` field added to elements of the + `active_clients` array. Previously, only the address of a module's RADOS + client was shown in the `active_clients` array. + >=16.2.12 --------- @@ -62,6 +73,65 @@ namespaces was added to RBD in Nautilus 14.2.0 and it has been possible to map and unmap images in namespaces using the `image-spec` syntax since then but the corresponding option available in most other commands was missing. +* RGW: Compression is now supported for objects uploaded with Server-Side Encryption. + When both are enabled, compression is applied before encryption. +* RGW: the "pubsub" functionality for storing bucket notifications inside Ceph + is removed. Together with it, the "pubsub" zone should not be used anymore. + The REST operations, as well as radosgw-admin commands for manipulating + subscriptions, as well as fetching and acking the notifications are removed + as well. + In case that the endpoint to which the notifications are sent maybe down or + disconnected, it is recommended to use persistent notifications to guarantee + the delivery of the notifications. In case the system that consumes the + notifications needs to pull them (instead of the notifications be pushed + to it), an external message bus (e.g. rabbitmq, Kafka) should be used for + that purpose. +* RGW: The serialized format of notification and topics has changed, so that + new/updated topics will be unreadable by old RGWs. We recommend completing + the RGW upgrades before creating or modifying any notification topics. +* RBD: Trailing newline in passphrase files (`` argument in + `rbd encryption format` command and `--encryption-passphrase-file` option + in other commands) is no longer stripped. +* RBD: Support for layered client-side encryption is added. Cloned images + can now be encrypted each with its own encryption format and passphrase, + potentially different from that of the parent image. The efficient + copy-on-write semantics intrinsic to unformatted (regular) cloned images + are retained. +* CEPHFS: Rename the `mds_max_retries_on_remount_failure` option to + `client_max_retries_on_remount_failure` and move it from mds.yaml.in to + mds-client.yaml.in because this option was only used by MDS client from its + birth. +* The `perf dump` and `perf schema` commands are deprecated in favor of new + `counter dump` and `counter schema` commands. These new commands add support + for labeled perf counters and also emit existing unlabeled perf counters. Some + unlabeled perf counters became labeled in this release, with more to follow in + future releases; such converted perf counters are no longer emitted by the + `perf dump` and `perf schema` commands. +* `ceph mgr dump` command now outputs `last_failure_osd_epoch` and + `active_clients` fields at the top level. Previously, these fields were + output under `always_on_modules` field. +* RBD: All rbd-mirror daemon perf counters became labeled and as such are now + emitted only by the new `counter dump` and `counter schema` commands. As part + of the conversion, many also got renamed to better disambiguate journal-based + and snapshot-based mirroring. 
+* RBD: list-watchers C++ API (`Image::list_watchers`) now clears the passed + `std::list` before potentially appending to it, aligning with the semantics + of the corresponding C API (`rbd_watchers_list`). +* Telemetry: Users who are opted-in to telemetry can also opt-in to + participating in a leaderboard in the telemetry public + dashboards (https://telemetry-public.ceph.com/). Users can now also add a + description of the cluster to publicly appear in the leaderboard. + For more details, see: + https://docs.ceph.com/en/latest/mgr/telemetry/#leaderboard + See a sample report with `ceph telemetry preview`. + Opt-in to telemetry with `ceph telemetry on`. + Opt-in to the leaderboard with + `ceph config set mgr mgr/telemetry/leaderboard true`. + Add leaderboard description with: + `ceph config set mgr mgr/telemetry/leaderboard_description ‘Cluster description’`. +* CEPHFS: After recovering a Ceph File System post following the disaster recovery + procedure, the recovered files under `lost+found` directory can now be deleted. +* core: cache-tiering is now deprecated. >=16.2.8 -------- diff --git a/ceph/ceph.spec b/ceph/ceph.spec index 2bfdbc9ad..912c5f409 100644 --- a/ceph/ceph.spec +++ b/ceph/ceph.spec @@ -135,7 +135,7 @@ # main package definition ################################################################################# Name: ceph -Version: 16.2.13 +Version: 16.2.14 Release: 0%{?dist} %if 0%{?fedora} || 0%{?rhel} Epoch: 2 @@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD- Group: System/Filesystems %endif URL: http://ceph.com/ -Source0: %{?_remote_tarball_prefix}ceph-16.2.13.tar.bz2 +Source0: %{?_remote_tarball_prefix}ceph-16.2.14.tar.bz2 %if 0%{?suse_version} # _insert_obs_source_lines_here ExclusiveArch: x86_64 aarch64 ppc64le s390x @@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus. # common ################################################################################# %prep -%autosetup -p1 -n ceph-16.2.13 +%autosetup -p1 -n ceph-16.2.14 %build # Disable lto on systems that do not support symver attribute diff --git a/ceph/changelog.upstream b/ceph/changelog.upstream index 774c39ca2..5c9040201 100644 --- a/ceph/changelog.upstream +++ b/ceph/changelog.upstream @@ -1,7 +1,13 @@ -ceph (16.2.13-1focal) focal; urgency=medium +ceph (16.2.14-1focal) focal; urgency=medium - -- Jenkins Build Slave User Mon, 08 May 2023 20:49:59 +0000 + -- Jenkins Build Slave User Tue, 29 Aug 2023 16:38:35 +0000 + +ceph (16.2.14-1) stable; urgency=medium + + * New upstream release + + -- Ceph Release Team Tue, 29 Aug 2023 15:43:56 +0000 ceph (16.2.13-1) stable; urgency=medium diff --git a/ceph/debian/cephfs-mirror.install b/ceph/debian/cephfs-mirror.install index 19d2e483d..ca0807e27 100644 --- a/ceph/debian/cephfs-mirror.install +++ b/ceph/debian/cephfs-mirror.install @@ -1 +1,3 @@ +lib/systemd/system/cephfs-mirror* usr/bin/cephfs-mirror +usr/share/man/man8/cephfs-mirror.8 diff --git a/ceph/doc/cephadm/operations.rst b/ceph/doc/cephadm/operations.rst index 23b396b51..09b6219c1 100644 --- a/ceph/doc/cephadm/operations.rst +++ b/ceph/doc/cephadm/operations.rst @@ -43,17 +43,17 @@ monitor hosts as well as to the monitor daemons' stderr. Ceph daemon logs ================ -Logging to journald -------------------- +Logging to stdout +----------------- -Ceph daemons traditionally write logs to ``/var/log/ceph``. Ceph daemons log to -journald by default and Ceph logs are captured by the container runtime -environment. 
They are accessible via ``journalctl``. +Ceph daemons traditionally write logs to ``/var/log/ceph``. Ceph +daemons log to stderr by default and Ceph logs are captured by the +container runtime environment. By default, most systems send these +logs to journald, which means that they are accessible via +``journalctl``. -.. note:: Prior to Quincy, ceph daemons logged to stderr. - -Example of logging to journald -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Example of logging to stdout +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ For example, to view the logs for the daemon ``mon.foo`` for a cluster with ID ``5c5a50ae-272a-455d-99e9-32c6a013e694``, the command would be @@ -69,11 +69,11 @@ Logging to files ---------------- You can also configure Ceph daemons to log to files instead of to -journald if you prefer logs to appear in files (as they did in earlier, +stderr if you prefer logs to appear in files (as they did in earlier, pre-cephadm, pre-Octopus versions of Ceph). When Ceph logs to files, the logs appear in ``/var/log/ceph/``. If you choose to -configure Ceph to log to files instead of to journald, remember to -configure Ceph so that it will not log to journald (the commands for +configure Ceph to log to files instead of to stderr, remember to +configure Ceph so that it will not log to stderr (the commands for this are covered below). Enabling logging to files @@ -86,10 +86,10 @@ To enable logging to files, run the following commands: ceph config set global log_to_file true ceph config set global mon_cluster_log_to_file true -Disabling logging to journald -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Disabling logging to stderr +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you choose to log to files, we recommend disabling logging to journald or else +If you choose to log to files, we recommend disabling logging to stderr or else everything will be logged twice. Run the following commands to disable logging to stderr: @@ -97,11 +97,6 @@ to stderr: ceph config set global log_to_stderr false ceph config set global mon_cluster_log_to_stderr false - ceph config set global log_to_journald false - ceph config set global mon_cluster_log_to_journald false - -.. note:: You can change the default by passing --log-to-file during - bootstrapping a new cluster. Modifying the log retention schedule ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/ceph/doc/cephadm/services/index.rst b/ceph/doc/cephadm/services/index.rst index 317fc4c69..44a09c987 100644 --- a/ceph/doc/cephadm/services/index.rst +++ b/ceph/doc/cephadm/services/index.rst @@ -558,6 +558,7 @@ For example: Extra Entrypoint Arguments ========================== + .. note:: For arguments intended for the container runtime rather than the process inside @@ -577,6 +578,57 @@ the node-exporter service , one could apply a service spec like extra_entrypoint_args: - "--collector.textfile.directory=/var/lib/node_exporter/textfile_collector2" +Custom Config Files +=================== + +Cephadm supports specifying miscellaneous config files for daemons. +To do so, users must provide both the content of the config file and the +location within the daemon's container at which it should be mounted. After +applying a YAML spec with custom config files specified and having cephadm +redeploy the daemons for which the config files are specified, these files will +be mounted within the daemon's container at the specified location. + +Example service spec: + +.. 
code-block:: yaml + + service_type: grafana + service_name: grafana + custom_configs: + - mount_path: /etc/example.conf + content: | + setting1 = value1 + setting2 = value2 + - mount_path: /usr/share/grafana/example.cert + content: | + -----BEGIN PRIVATE KEY----- + V2VyIGRhcyBsaWVzdCBpc3QgZG9vZi4gTG9yZW0gaXBzdW0gZG9sb3Igc2l0IGFt + ZXQsIGNvbnNldGV0dXIgc2FkaXBzY2luZyBlbGl0ciwgc2VkIGRpYW0gbm9udW15 + IGVpcm1vZCB0ZW1wb3IgaW52aWR1bnQgdXQgbGFib3JlIGV0IGRvbG9yZSBtYWdu + YSBhbGlxdXlhbSBlcmF0LCBzZWQgZGlhbSB2b2x1cHR1YS4gQXQgdmVybyBlb3Mg + ZXQgYWNjdXNhbSBldCBqdXN0byBkdW8= + -----END PRIVATE KEY----- + -----BEGIN CERTIFICATE----- + V2VyIGRhcyBsaWVzdCBpc3QgZG9vZi4gTG9yZW0gaXBzdW0gZG9sb3Igc2l0IGFt + ZXQsIGNvbnNldGV0dXIgc2FkaXBzY2luZyBlbGl0ciwgc2VkIGRpYW0gbm9udW15 + IGVpcm1vZCB0ZW1wb3IgaW52aWR1bnQgdXQgbGFib3JlIGV0IGRvbG9yZSBtYWdu + YSBhbGlxdXlhbSBlcmF0LCBzZWQgZGlhbSB2b2x1cHR1YS4gQXQgdmVybyBlb3Mg + ZXQgYWNjdXNhbSBldCBqdXN0byBkdW8= + -----END CERTIFICATE----- + +To make these new config files actually get mounted within the +containers for the daemons + +.. prompt:: bash + + ceph orch redeploy + +For example: + +.. prompt:: bash + + ceph orch redeploy grafana + .. _orch-rm: Removing a Service diff --git a/ceph/doc/cephadm/services/monitoring.rst b/ceph/doc/cephadm/services/monitoring.rst index 86e3e3f69..f29a93e82 100644 --- a/ceph/doc/cephadm/services/monitoring.rst +++ b/ceph/doc/cephadm/services/monitoring.rst @@ -299,13 +299,16 @@ and the metrics will not be visible in Prometheus. Setting up Prometheus ----------------------- -Setting Prometheus Retention Time -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Setting Prometheus Retention Size and Time +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Cephadm provides the option to set the Prometheus TDSB retention time using -a ``retention_time`` field in the Prometheus service spec. The value defaults -to 15 days (15d). If you would like a different value, such as 1 year (1y) you -can apply a service spec similar to: +Cephadm can configure Prometheus TSDB retention by specifying ``retention_time`` +and ``retention_size`` values in the Prometheus service spec. +The retention time value defaults to 15 days (15d). Users can set a different value/unit where +supported units are: 'y', 'w', 'd', 'h', 'm' and 's'. The retention size value defaults +to 0 (disabled). Supported units in this case are: 'B', 'KB', 'MB', 'GB', 'TB', 'PB' and 'EB'. + +In the following example spec we set the retention time to 1 year and the size to 1GB. .. code-block:: yaml @@ -314,6 +317,7 @@ can apply a service spec similar to: count: 1 spec: retention_time: "1y" + retention_size: "1GB" .. note:: diff --git a/ceph/doc/cephadm/services/osd.rst b/ceph/doc/cephadm/services/osd.rst index de0d4f82a..f60956384 100644 --- a/ceph/doc/cephadm/services/osd.rst +++ b/ceph/doc/cephadm/services/osd.rst @@ -308,7 +308,7 @@ Replacing an OSD .. prompt:: bash # - orch osd rm --replace [--force] + ceph orch osd rm --replace [--force] Example: diff --git a/ceph/doc/cephfs/cephfs-mirroring.rst b/ceph/doc/cephfs/cephfs-mirroring.rst index 3dbaa5d1a..7d5dfa6cf 100644 --- a/ceph/doc/cephfs/cephfs-mirroring.rst +++ b/ceph/doc/cephfs/cephfs-mirroring.rst @@ -14,6 +14,8 @@ Requirements The primary (local) and secondary (remote) Ceph clusters version should be Pacific or later. +.. _cephfs_mirroring_creating_users: + Creating Users -------------- @@ -42,80 +44,155 @@ Mirror daemon should be spawned using `systemctl(1)` unit files:: $ cephfs-mirror --id mirror --cluster site-a -f -.. 
note:: User used here is `mirror` created in the `Creating Users` section. +.. note:: The user specified here is `mirror`, the creation of which is + described in the :ref:`Creating Users` + section. + +Multiple ``cephfs-mirror`` daemons may be deployed for concurrent +synchronization and high availability. Mirror daemons share the synchronization +load using a simple ``M/N`` policy, where ``M`` is the number of directories +and ``N`` is the number of ``cephfs-mirror`` daemons. + +When ``cephadm`` is used to manage a Ceph cluster, ``cephfs-mirror`` daemons can be +deployed by running the following command: + +.. prompt:: bash $ + + ceph orch apply cephfs-mirror + +To deploy multiple mirror daemons, run a command of the following form: + +.. prompt:: bash $ + + ceph orch apply cephfs-mirror --placement= + +For example, to deploy 3 `cephfs-mirror` daemons on different hosts, run a command of the following form: + +.. prompt:: bash $ + + $ ceph orch apply cephfs-mirror --placement="3 host1,host2,host3" Interface --------- -`Mirroring` module (manager plugin) provides interfaces for managing directory snapshot -mirroring. Manager interfaces are (mostly) wrappers around monitor commands for managing -file system mirroring and is the recommended control interface. +The `Mirroring` module (manager plugin) provides interfaces for managing +directory snapshot mirroring. These are (mostly) wrappers around monitor +commands for managing file system mirroring and is the recommended control +interface. Mirroring Module ---------------- -The mirroring module is responsible for assigning directories to mirror daemons for -synchronization. Multiple mirror daemons can be spawned to achieve concurrency in -directory snapshot synchronization. When mirror daemons are spawned (or terminated) -, the mirroring module discovers the modified set of mirror daemons and rebalances -the directory assignment amongst the new set thus providing high-availability. +The mirroring module is responsible for assigning directories to mirror daemons +for synchronization. Multiple mirror daemons can be spawned to achieve +concurrency in directory snapshot synchronization. When mirror daemons are +spawned (or terminated), the mirroring module discovers the modified set of +mirror daemons and rebalances directory assignments across the new set, thus +providing high-availability. -.. note:: Multiple mirror daemons is currently untested. Only a single mirror daemon - is recommended. +.. note:: Deploying a single mirror daemon is recommended. Running multiple + daemons is untested. -Mirroring module is disabled by default. To enable mirroring use:: +The mirroring module is disabled by default. To enable the mirroring module, +run the following command: - $ ceph mgr module enable mirroring +.. prompt:: bash $ -Mirroring module provides a family of commands to control mirroring of directory -snapshots. To add or remove directories, mirroring needs to be enabled for a given -file system. To enable mirroring use:: + ceph mgr module enable mirroring - $ ceph fs snapshot mirror enable +The mirroring module provides a family of commands that can be used to control +the mirroring of directory snapshots. To add or remove directories, mirroring +must be enabled for a given file system. To enable mirroring for a given file +system, run a command of the following form: -.. note:: Mirroring module commands use `fs snapshot mirror` prefix as compared to - the monitor commands which `fs mirror` prefix. Make sure to use module - commands. +.. 
prompt:: bash $ -To disable mirroring, use:: + ceph fs snapshot mirror enable - $ ceph fs snapshot mirror disable +.. note:: "Mirroring module" commands are prefixed with ``fs snapshot mirror``. + This distinguishes them from "monitor commands", which are prefixed with ``fs + mirror``. Be sure (in this context) to use module commands. -Once mirroring is enabled, add a peer to which directory snapshots are to be mirrored. -Peers follow `@` specification and get assigned a unique-id (UUID) -when added. See `Creating Users` section on how to create Ceph users for mirroring. +To disable mirroring for a given file system, run a command of the following form: -To add a peer use:: +.. prompt:: bash $ - $ ceph fs snapshot mirror peer_add [] [] [] + ceph fs snapshot mirror disable -`` is optional, and defaults to `` (on the remote cluster). +After mirroring is enabled, add a peer to which directory snapshots are to be +mirrored. Peers are specified by the ``@`` format, which is +referred to elsewhere in this document as the ``remote_cluster_spec``. Peers +are assigned a unique-id (UUID) when added. See the :ref:`Creating +Users` section for instructions that describe +how to create Ceph users for mirroring. -This requires the remote cluster ceph configuration and user keyring to be available in -the primary cluster. See `Bootstrap Peers` section to avoid this. `peer_add` additionally -supports passing the remote cluster monitor address and the user key. However, bootstrapping -a peer is the recommended way to add a peer. +To add a peer, run a command of the following form: + +.. prompt:: bash $ + + ceph fs snapshot mirror peer_add [] [] [] + +```` is of the format ``client.@``. + +```` is optional, and defaults to `` (on the remote +cluster). + +For this command to succeed, the remote cluster's Ceph configuration and user +keyring must be available in the primary cluster. For example, if a user named +``client_mirror`` is created on the remote cluster which has ``rwps`` +permissions for the remote file system named ``remote_fs`` (see `Creating +Users`) and the remote cluster is named ``remote_ceph`` (that is, the remote +cluster configuration file is named ``remote_ceph.conf`` on the primary +cluster), run the following command to add the remote filesystem as a peer to +the primary filesystem ``primary_fs``: + +.. prompt:: bash $ + + ceph fs snapshot mirror peer_add primary_fs client.mirror_remote@remote_ceph remote_fs + +To avoid having to maintain the remote cluster configuration file and remote +ceph user keyring in the primary cluster, users can bootstrap a peer (which +stores the relevant remote cluster details in the monitor config store on the +primary cluster). See the :ref:`Bootstrap +Peers` section. + +The ``peer_add`` command supports passing the remote cluster monitor address +and the user key. However, bootstrapping a peer is the recommended way to add a +peer. .. note:: Only a single peer is supported right now. -To remove a peer use:: +To remove a peer, run a command of the following form: - $ ceph fs snapshot mirror peer_remove +.. prompt:: bash $ -To list file system mirror peers use:: + ceph fs snapshot mirror peer_remove - $ ceph fs snapshot mirror peer_list +To list file system mirror peers, run a command of the following form: -To configure a directory for mirroring, use:: +.. 
prompt:: bash $ - $ ceph fs snapshot mirror add + ceph fs snapshot mirror peer_list -To stop a mirroring directory snapshots use:: +To configure a directory for mirroring, run a command of the following form: - $ ceph fs snapshot mirror remove +.. prompt:: bash $ -Only absolute directory paths are allowed. Also, paths are normalized by the mirroring -module, therfore, `/a/b/../b` is equivalent to `/a/b`. + ceph fs snapshot mirror add + +To stop mirroring directory snapshots, run a command of the following form: + +.. prompt:: bash $ + + ceph fs snapshot mirror remove + +Only absolute directory paths are allowed. + +Paths are normalized by the mirroring module. This means that ``/a/b/../b`` is +equivalent to ``/a/b``. Paths always start from the CephFS file-system root and +not from the host system mount point. + +For example:: $ mkdir -p /d0/d1/d2 $ ceph fs snapshot mirror add cephfs /d0/d1/d2 @@ -123,16 +200,19 @@ module, therfore, `/a/b/../b` is equivalent to `/a/b`. $ ceph fs snapshot mirror add cephfs /d0/d1/../d1/d2 Error EEXIST: directory /d0/d1/d2 is already tracked -Once a directory is added for mirroring, its subdirectory or ancestor directories are -disallowed to be added for mirorring:: +After a directory is added for mirroring, the additional mirroring of +subdirectories or ancestor directories is disallowed:: $ ceph fs snapshot mirror add cephfs /d0/d1 Error EINVAL: /d0/d1 is a ancestor of tracked path /d0/d1/d2 $ ceph fs snapshot mirror add cephfs /d0/d1/d2/d3 Error EINVAL: /d0/d1/d2/d3 is a subtree of tracked path /d0/d1/d2 -Commands to check directory mapping (to mirror daemons) and directory distribution are -detailed in `Mirroring Status` section. +The :ref:`Mirroring Status` section contains +information about the commands for checking the directory mapping (to mirror +daemons) and for checking the directory distribution. + +.. _cephfs_mirroring_bootstrap_peers: Bootstrap Peers --------------- @@ -160,6 +240,9 @@ e.g.:: $ ceph fs snapshot mirror peer_bootstrap import cephfs eyJmc2lkIjogIjBkZjE3MjE3LWRmY2QtNDAzMC05MDc5LTM2Nzk4NTVkNDJlZiIsICJmaWxlc3lzdGVtIjogImJhY2t1cF9mcyIsICJ1c2VyIjogImNsaWVudC5taXJyb3JfcGVlcl9ib290c3RyYXAiLCAic2l0ZV9uYW1lIjogInNpdGUtcmVtb3RlIiwgImtleSI6ICJBUUFhcDBCZ0xtRmpOeEFBVnNyZXozai9YYUV0T2UrbUJEZlJDZz09IiwgIm1vbl9ob3N0IjogIlt2MjoxOTIuMTY4LjAuNTo0MDkxOCx2MToxOTIuMTY4LjAuNTo0MDkxOV0ifQ== + +.. _cephfs_mirroring_mirroring_status: + Mirroring Status ---------------- diff --git a/ceph/doc/cephfs/cephfs-top.rst b/ceph/doc/cephfs/cephfs-top.rst index 857a50948..511d0437c 100644 --- a/ceph/doc/cephfs/cephfs-top.rst +++ b/ceph/doc/cephfs/cephfs-top.rst @@ -78,7 +78,15 @@ By default, `cephfs-top` connects to cluster name `ceph`. To use a non-default c $ cephfs-top -d -Interval should be greater or equal to 0.5 second. Fractional seconds are honoured. +Refresh interval should be a positive integer. + +To dump the metrics to stdout without creating a curses display use:: + + $ cephfs-top --dump + +To dump the metrics of the given filesystem to stdout without creating a curses display use:: + + $ cephfs-top --dumpfs Interactive Commands -------------------- @@ -104,3 +112,5 @@ The metrics display can be scrolled using the Arrow Keys, PgUp/PgDn, Home/End an Sample screenshot running `cephfs-top` with 2 filesystems: .. image:: cephfs-top.png + +.. note:: Minimum compatible python version for cephfs-top is 3.6.0. cephfs-top is supported on distros RHEL 8, Ubuntu 18.04, CentOS 8 and above. 
diff --git a/ceph/doc/cephfs/disaster-recovery-experts.rst b/ceph/doc/cephfs/disaster-recovery-experts.rst index 9688caa03..d817f6d17 100644 --- a/ceph/doc/cephfs/disaster-recovery-experts.rst +++ b/ceph/doc/cephfs/disaster-recovery-experts.rst @@ -149,8 +149,8 @@ errors. :: - cephfs-data-scan scan_extents - cephfs-data-scan scan_inodes + cephfs-data-scan scan_extents [ [ ...]] + cephfs-data-scan scan_inodes [] cephfs-data-scan scan_links 'scan_extents' and 'scan_inodes' commands may take a *very long* time @@ -166,22 +166,22 @@ The example below shows how to run 4 workers simultaneously: :: # Worker 0 - cephfs-data-scan scan_extents --worker_n 0 --worker_m 4 + cephfs-data-scan scan_extents --worker_n 0 --worker_m 4 # Worker 1 - cephfs-data-scan scan_extents --worker_n 1 --worker_m 4 + cephfs-data-scan scan_extents --worker_n 1 --worker_m 4 # Worker 2 - cephfs-data-scan scan_extents --worker_n 2 --worker_m 4 + cephfs-data-scan scan_extents --worker_n 2 --worker_m 4 # Worker 3 - cephfs-data-scan scan_extents --worker_n 3 --worker_m 4 + cephfs-data-scan scan_extents --worker_n 3 --worker_m 4 # Worker 0 - cephfs-data-scan scan_inodes --worker_n 0 --worker_m 4 + cephfs-data-scan scan_inodes --worker_n 0 --worker_m 4 # Worker 1 - cephfs-data-scan scan_inodes --worker_n 1 --worker_m 4 + cephfs-data-scan scan_inodes --worker_n 1 --worker_m 4 # Worker 2 - cephfs-data-scan scan_inodes --worker_n 2 --worker_m 4 + cephfs-data-scan scan_inodes --worker_n 2 --worker_m 4 # Worker 3 - cephfs-data-scan scan_inodes --worker_n 3 --worker_m 4 + cephfs-data-scan scan_inodes --worker_n 3 --worker_m 4 It is **important** to ensure that all workers have completed the scan_extents phase before any workers enter the scan_inodes phase. @@ -191,8 +191,13 @@ operation to delete ancillary data geneated during recovery. :: - cephfs-data-scan cleanup + cephfs-data-scan cleanup [] +Note, the data pool parameters for 'scan_extents', 'scan_inodes' and +'cleanup' commands are optional, and usually the tool will be able to +detect the pools automatically. Still you may override this. The +'scan_extents' command needs all data pools to be specified, while +'scan_inodes' and 'cleanup' commands need only the main data pool. Using an alternate metadata pool for recovery @@ -250,8 +255,8 @@ Now perform the recovery of the metadata pool from the data pool: :: cephfs-data-scan init --force-init --filesystem cephfs_recovery --alternate-pool cephfs_recovery_meta - cephfs-data-scan scan_extents --alternate-pool cephfs_recovery_meta --filesystem - cephfs-data-scan scan_inodes --alternate-pool cephfs_recovery_meta --filesystem --force-corrupt + cephfs-data-scan scan_extents --alternate-pool cephfs_recovery_meta --filesystem + cephfs-data-scan scan_inodes --alternate-pool cephfs_recovery_meta --filesystem --force-corrupt cephfs-data-scan scan_links --filesystem cephfs_recovery .. note:: diff --git a/ceph/doc/cephfs/fs-volumes.rst b/ceph/doc/cephfs/fs-volumes.rst index e287478ee..10857c50b 100644 --- a/ceph/doc/cephfs/fs-volumes.rst +++ b/ceph/doc/cephfs/fs-volumes.rst @@ -3,23 +3,22 @@ FS volumes and subvolumes ========================= -The volumes -module of the :term:`Ceph Manager` daemon (ceph-mgr) provides a single -source of truth for CephFS exports. The OpenStack shared -file system service (manila_) and Ceph Container Storage Interface (CSI_) -storage administrators among others can use the common CLI provided by the -ceph-mgr volumes module to manage CephFS exports. 
+The volumes module of the :term:`Ceph Manager` daemon (ceph-mgr) provides a +single source of truth for CephFS exports. The OpenStack shared file system +service (manila_) and the Ceph Container Storage Interface (CSI_) storage +administrators use the common CLI provided by the ceph-mgr ``volumes`` module +to manage CephFS exports. -The ceph-mgr volumes module implements the following file system export -abstactions: +The ceph-mgr ``volumes`` module implements the following file system export +abstractions: * FS volumes, an abstraction for CephFS file systems * FS subvolumes, an abstraction for independent CephFS directory trees * FS subvolume groups, an abstraction for a directory level higher than FS - subvolumes to effect policies (e.g., :doc:`/cephfs/file-layouts`) across a - set of subvolumes + subvolumes. Used to effect policies (e.g., :doc:`/cephfs/file-layouts`) + across a set of subvolumes Some possible use-cases for the export abstractions: @@ -38,67 +37,76 @@ Requirements mon 'allow r' mgr 'allow rw' - FS Volumes ---------- -Create a volume using:: +Create a volume by running the following command: - $ ceph fs volume create [] +.. prompt:: bash $ + + ceph fs volume create [] This creates a CephFS file system and its data and metadata pools. It can also -deploy MDS daemons for the filesystem using a ceph-mgr orchestrator -module (see :doc:`/mgr/orchestrator`), for example Rook. +deploy MDS daemons for the filesystem using a ceph-mgr orchestrator module (for +example Rook). See :doc:`/mgr/orchestrator`. - is the volume name (an arbitrary string), and - is an optional string that designates the hosts that should have -an MDS running on them and, optionally, the total number of MDS daemons the cluster -should have. For example, the -following placement string means "deploy MDS on nodes ``host1`` and ``host2`` (one -MDS per host): +```` is the volume name (an arbitrary string). ```` is an +optional string that specifies the hosts that should have an MDS running on +them and, optionally, the total number of MDS daemons that the cluster should +have. For example, the following placement string means "deploy MDS on nodes +``host1`` and ``host2`` (one MDS per host):: "host1,host2" -and this placement specification says to deploy two MDS daemons on each of -nodes ``host1`` and ``host2`` (for a total of four MDS daemons in the cluster): +The following placement specification means "deploy two MDS daemons on each of +nodes ``host1`` and ``host2`` (for a total of four MDS daemons in the +cluster)":: "4 host1,host2" -For more details on placement specification refer to the :ref:`orchestrator-cli-service-spec`, -but keep in mind that specifying placement via a YAML file is not supported. +See :ref:`orchestrator-cli-service-spec` for more on placement specification. +Specifying placement via a YAML file is not supported. -To remove a volume, run the following command:: +To remove a volume, run the following command: - $ ceph fs volume rm [--yes-i-really-mean-it] +.. prompt:: bash $ + + ceph fs volume rm [--yes-i-really-mean-it] This removes a file system and its data and metadata pools. It also tries to remove MDS daemons using the enabled ceph-mgr orchestrator module. -List volumes using:: +List volumes by running the following command: - $ ceph fs volume ls +.. prompt:: bash $ -Rename a volume using:: + ceph fs volume ls - $ ceph fs volume rename [--yes-i-really-mean-it] +Rename a volume by running the following command: + +.. 
prompt:: bash $ + + ceph fs volume rename [--yes-i-really-mean-it] Renaming a volume can be an expensive operation that requires the following: -- Rename the orchestrator-managed MDS service to match the . - This involves launching a MDS service with and bringing down - the MDS service with . -- Rename the file system matching to -- Change the application tags on the data and metadata pools of the file system - to -- Rename the metadata and data pools of the file system. +- Renaming the orchestrator-managed MDS service to match the . + This involves launching a MDS service with ```` and bringing + down the MDS service with ````. +- Renaming the file system matching ```` to ````. +- Changing the application tags on the data and metadata pools of the file system + to ````. +- Renaming the metadata and data pools of the file system. -The CephX IDs authorized for need to be reauthorized for . Any -on-going operations of the clients using these IDs may be disrupted. Mirroring is -expected to be disabled on the volume. +The CephX IDs that are authorized for ```` must be reauthorized for +````. Any ongoing operations of the clients using these IDs may +be disrupted. Ensure that mirroring is disabled on the volume. -To fetch the information of a CephFS volume, run:: +To fetch the information of a CephFS volume, run the following command: - $ ceph fs volume info vol_name [--human_readable] +.. prompt:: bash $ + + ceph fs volume info vol_name [--human_readable] The ``--human_readable`` flag shows used and available pool capacities in KB/MB/GB. @@ -142,9 +150,11 @@ Sample output of the ``volume info`` command:: FS Subvolume groups ------------------- -Create a subvolume group using:: +Create a subvolume group by running the following command: - $ ceph fs subvolumegroup create [--size ] [--pool_layout ] [--uid ] [--gid ] [--mode ] +.. prompt:: bash $ + + ceph fs subvolumegroup create [--size ] [--pool_layout ] [--uid ] [--gid ] [--mode ] The command succeeds even if the subvolume group already exists. @@ -152,32 +162,41 @@ When creating a subvolume group you can specify its data pool layout (see :doc:`/cephfs/file-layouts`), uid, gid, file mode in octal numerals, and size in bytes. The size of the subvolume group is specified by setting a quota on it (see :doc:`/cephfs/quota`). By default, the subvolume group -is created with octal file mode '755', uid '0', gid '0' and the data pool +is created with octal file mode ``755``, uid ``0``, gid ``0`` and the data pool layout of its parent directory. +Remove a subvolume group by running a command of the following form: -Remove a subvolume group using:: +.. prompt:: bash $ - $ ceph fs subvolumegroup rm [--force] + ceph fs subvolumegroup rm [--force] -The removal of a subvolume group fails if it is not empty or non-existent. -'--force' flag allows the non-existent subvolume group remove command to succeed. +The removal of a subvolume group fails if the subvolume group is not empty or +is non-existent. The ``--force`` flag allows the non-existent "subvolume group +remove command" to succeed. -Fetch the absolute path of a subvolume group using:: +Fetch the absolute path of a subvolume group by running a command of the +following form: - $ ceph fs subvolumegroup getpath +.. prompt:: bash $ -List subvolume groups using:: + ceph fs subvolumegroup getpath - $ ceph fs subvolumegroup ls +List subvolume groups by running a command of the following form: + +.. prompt:: bash $ + + ceph fs subvolumegroup ls .. 
note:: Subvolume group snapshot feature is no longer supported in mainline CephFS (existing group snapshots can still be listed and deleted) -Fetch the metadata of a subvolume group using:: +Fetch the metadata of a subvolume group by running a command of the following form: - $ ceph fs subvolumegroup info +.. prompt:: bash $ + + ceph fs subvolumegroup info The output format is JSON and contains fields as follows: @@ -194,62 +213,77 @@ The output format is JSON and contains fields as follows: * ``created_at``: creation time of the subvolume group in the format "YYYY-MM-DD HH:MM:SS" * ``data_pool``: data pool to which the subvolume group belongs -Check the presence of any subvolume group using:: +Check the presence of any subvolume group by running a command of the following form: - $ ceph fs subvolumegroup exist +.. prompt:: bash $ -The 'exist' command outputs: + ceph fs subvolumegroup exist + +The ``exist`` command outputs: * "subvolumegroup exists": if any subvolumegroup is present * "no subvolumegroup exists": if no subvolumegroup is present -.. note:: This command checks for the presence of custom groups and not presence of the default one. To validate the emptiness of the volume, a subvolumegroup existence check alone is not sufficient. Subvolume existence also needs to be checked as there might be subvolumes in the default group. +.. note:: This command checks for the presence of custom groups and not + presence of the default one. To validate the emptiness of the volume, a + subvolumegroup existence check alone is not sufficient. Subvolume existence + also needs to be checked as there might be subvolumes in the default group. -Resize a subvolume group using:: +Resize a subvolume group by running a command of the following form: - $ ceph fs subvolumegroup resize [--no_shrink] +.. prompt:: bash $ -The command resizes the subvolume group quota using the size specified by ``new_size``. -The ``--no_shrink`` flag prevents the subvolume group from shrinking below the current used -size. + ceph fs subvolumegroup resize [--no_shrink] -The subvolume group may be resized to an infinite size by passing ``inf`` or ``infinite`` -as the ``new_size``. +The command resizes the subvolume group quota, using the size specified by +``new_size``. The ``--no_shrink`` flag prevents the subvolume group from +shrinking below the current used size. -Remove a snapshot of a subvolume group using:: +The subvolume group may be resized to an infinite size by passing ``inf`` or +``infinite`` as the ``new_size``. - $ ceph fs subvolumegroup snapshot rm [--force] +Remove a snapshot of a subvolume group by running a command of the following form: + +.. prompt:: bash $ + + ceph fs subvolumegroup snapshot rm [--force] Supplying the ``--force`` flag allows the command to succeed when it would otherwise -fail due to the snapshot not existing. +fail due to the nonexistence of the snapshot. -List snapshots of a subvolume group using:: +List snapshots of a subvolume group by running a command of the following form: - $ ceph fs subvolumegroup snapshot ls +.. prompt:: bash $ + + ceph fs subvolumegroup snapshot ls FS Subvolumes ------------- -Create a subvolume using:: +Create a subvolume using: - $ ceph fs subvolume create [--size ] [--group_name ] [--pool_layout ] [--uid ] [--gid ] [--mode ] [--namespace-isolated] +.. prompt:: bash $ + + ceph fs subvolume create [--size ] [--group_name ] [--pool_layout ] [--uid ] [--gid ] [--mode ] [--namespace-isolated] The command succeeds even if the subvolume already exists. 
-When creating a subvolume you can specify its subvolume group, data pool layout, -uid, gid, file mode in octal numerals, and size in bytes. The size of the subvolume is -specified by setting a quota on it (see :doc:`/cephfs/quota`). The subvolume can be -created in a separate RADOS namespace by specifying --namespace-isolated option. By -default a subvolume is created within the default subvolume group, and with an octal file -mode '755', uid of its subvolume group, gid of its subvolume group, data pool layout of -its parent directory and no size limit. +When creating a subvolume you can specify its subvolume group, data pool +layout, uid, gid, file mode in octal numerals, and size in bytes. The size of +the subvolume is specified by setting a quota on it (see :doc:`/cephfs/quota`). +The subvolume can be created in a separate RADOS namespace by specifying +--namespace-isolated option. By default a subvolume is created within the +default subvolume group, and with an octal file mode '755', uid of its +subvolume group, gid of its subvolume group, data pool layout of its parent +directory and no size limit. -Remove a subvolume using:: +Remove a subvolume using: - $ ceph fs subvolume rm [--group_name ] [--force] [--retain-snapshots] +.. prompt:: bash $ + ceph fs subvolume rm [--group_name ] [--force] [--retain-snapshots] The command removes the subvolume and its contents. It does this in two steps. First, it moves the subvolume to a trash folder, and then asynchronously purges @@ -262,44 +296,62 @@ A subvolume can be removed retaining existing snapshots of the subvolume using t '--retain-snapshots' option. If snapshots are retained, the subvolume is considered empty for all operations not involving the retained snapshots. -.. note:: Snapshot retained subvolumes can be recreated using 'ceph fs subvolume create' +.. note:: Snapshot retained subvolumes can be recreated using 'ceph fs + subvolume create' -.. note:: Retained snapshots can be used as a clone source to recreate the subvolume, or clone to a newer subvolume. +.. note:: Retained snapshots can be used as a clone source to recreate the + subvolume, or clone to a newer subvolume. -Resize a subvolume using:: +Resize a subvolume using: - $ ceph fs subvolume resize [--group_name ] [--no_shrink] +.. prompt:: bash $ -The command resizes the subvolume quota using the size specified by ``new_size``. -The `--no_shrink`` flag prevents the subvolume from shrinking below the current used size of the subvolume. + ceph fs subvolume resize [--group_name ] [--no_shrink] -The subvolume can be resized to an unlimited (but sparse) logical size by passing ``inf`` or ``infinite`` as `` new_size``. +The command resizes the subvolume quota using the size specified by +``new_size``. The `--no_shrink`` flag prevents the subvolume from shrinking +below the current used size of the subvolume. -Authorize cephx auth IDs, the read/read-write access to fs subvolumes:: +The subvolume can be resized to an unlimited (but sparse) logical size by +passing ``inf`` or ``infinite`` as `` new_size``. - $ ceph fs subvolume authorize [--group_name=] [--access_level=] +Authorize cephx auth IDs, the read/read-write access to fs subvolumes: -The 'access_level' takes 'r' or 'rw' as value. +.. prompt:: bash $ -Deauthorize cephx auth IDs, the read/read-write access to fs subvolumes:: + ceph fs subvolume authorize [--group_name=] [--access_level=] - $ ceph fs subvolume deauthorize [--group_name=] +The ``access_level`` takes ``r`` or ``rw`` as value. 
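As an illustration, assuming the volume ``cephfs``, the subvolume ``subvol_a``
and a client ID ``client_a`` (all placeholder names), read-write access could
be granted with:

.. prompt:: bash $

   ceph fs subvolume authorize cephfs subvol_a client_a --access_level=rw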
-List cephx auth IDs authorized to access fs subvolume:: +Deauthorize cephx auth IDs, the read/read-write access to fs subvolumes: - $ ceph fs subvolume authorized_list [--group_name=] +.. prompt:: bash $ -Evict fs clients based on auth ID and subvolume mounted:: + ceph fs subvolume deauthorize [--group_name=] - $ ceph fs subvolume evict [--group_name=] +List cephx auth IDs authorized to access fs subvolume: -Fetch the absolute path of a subvolume using:: +.. prompt:: bash $ - $ ceph fs subvolume getpath [--group_name ] + ceph fs subvolume authorized_list [--group_name=] -Fetch the information of a subvolume using:: +Evict fs clients based on auth ID and subvolume mounted: - $ ceph fs subvolume info [--group_name ] +.. prompt:: bash $ + + ceph fs subvolume evict [--group_name=] + +Fetch the absolute path of a subvolume using: + +.. prompt:: bash $ + + ceph fs subvolume getpath [--group_name ] + +Fetch the information of a subvolume using: + +.. prompt:: bash $ + + ceph fs subvolume info [--group_name ] The output format is JSON and contains fields as follows. @@ -339,67 +391,93 @@ A subvolume's ``state`` is based on the current state of the subvolume and conta * ``complete``: subvolume is ready for all operations * ``snapshot-retained``: subvolume is removed but its snapshots are retained -List subvolumes using:: +List subvolumes using: - $ ceph fs subvolume ls [--group_name ] +.. prompt:: bash $ -.. note:: subvolumes that are removed but have snapshots retained, are also listed. + ceph fs subvolume ls [--group_name ] -Check the presence of any subvolume using:: +.. note:: subvolumes that are removed but have snapshots retained, are also + listed. - $ ceph fs subvolume exist [--group_name ] +Check the presence of any subvolume using: + +.. prompt:: bash $ + + ceph fs subvolume exist [--group_name ] These are the possible results of the ``exist`` command: * ``subvolume exists``: if any subvolume of given group_name is present * ``no subvolume exists``: if no subvolume of given group_name is present -Set custom metadata on the subvolume as a key-value pair using:: +Set custom metadata on the subvolume as a key-value pair using: - $ ceph fs subvolume metadata set [--group_name ] +.. prompt:: bash $ -.. note:: If the key_name already exists then the old value will get replaced by the new value. + ceph fs subvolume metadata set [--group_name ] -.. note:: key_name and value should be a string of ASCII characters (as specified in python's string.printable). key_name is case-insensitive and always stored in lower case. +.. note:: If the key_name already exists then the old value will get replaced + by the new value. -.. note:: Custom metadata on a subvolume is not preserved when snapshotting the subvolume, and hence, is also not preserved when cloning the subvolume snapshot. +.. note:: key_name and value should be a string of ASCII characters (as + specified in python's string.printable). key_name is case-insensitive and + always stored in lower case. -Get custom metadata set on the subvolume using the metadata key:: +.. note:: Custom metadata on a subvolume is not preserved when snapshotting the + subvolume, and hence, is also not preserved when cloning the subvolume + snapshot. - $ ceph fs subvolume metadata get [--group_name ] +Get custom metadata set on the subvolume using the metadata key: -List custom metadata (key-value pairs) set on the subvolume using:: +.. 
prompt:: bash $ - $ ceph fs subvolume metadata ls [--group_name ] + ceph fs subvolume metadata get [--group_name ] -Remove custom metadata set on the subvolume using the metadata key:: +List custom metadata (key-value pairs) set on the subvolume using: - $ ceph fs subvolume metadata rm [--group_name ] [--force] +.. prompt:: bash $ + + ceph fs subvolume metadata ls [--group_name ] + +Remove custom metadata set on the subvolume using the metadata key: + +.. prompt:: bash $ + + ceph fs subvolume metadata rm [--group_name ] [--force] Using the ``--force`` flag allows the command to succeed that would otherwise fail if the metadata key did not exist. -Create a snapshot of a subvolume using:: +Create a snapshot of a subvolume using: - $ ceph fs subvolume snapshot create [--group_name ] +.. prompt:: bash $ + ceph fs subvolume snapshot create [--group_name ] -Remove a snapshot of a subvolume using:: +Remove a snapshot of a subvolume using: - $ ceph fs subvolume snapshot rm [--group_name ] [--force] +.. prompt:: bash $ + + ceph fs subvolume snapshot rm [--group_name ] [--force] Using the ``--force`` flag allows the command to succeed that would otherwise fail if the snapshot did not exist. -.. note:: if the last snapshot within a snapshot retained subvolume is removed, the subvolume is also removed +.. note:: if the last snapshot within a snapshot retained subvolume is removed, + the subvolume is also removed -List snapshots of a subvolume using:: +List snapshots of a subvolume using: - $ ceph fs subvolume snapshot ls [--group_name ] +.. prompt:: bash $ -Fetch the information of a snapshot using:: + ceph fs subvolume snapshot ls [--group_name ] - $ ceph fs subvolume snapshot info [--group_name ] +Fetch the information of a snapshot using: + +.. prompt:: bash $ + + ceph fs subvolume snapshot info [--group_name ] The output format is JSON and contains fields as follows. @@ -440,27 +518,40 @@ Sample output when no snapshot clone is in progress or pending:: "has_pending_clones": "no" } -Set custom key-value metadata on the snapshot by running:: +Set custom key-value metadata on the snapshot by running: - $ ceph fs subvolume snapshot metadata set [--group_name ] +.. prompt:: bash $ -.. note:: If the key_name already exists then the old value will get replaced by the new value. + ceph fs subvolume snapshot metadata set [--group_name ] -.. note:: The key_name and value should be a strings of ASCII characters (as specified in Python's ``string.printable``). The key_name is case-insensitive and always stored in lowercase. +.. note:: If the key_name already exists then the old value will get replaced + by the new value. -.. note:: Custom metadata on a snapshot is not preserved when snapshotting the subvolume, and hence is also not preserved when cloning the subvolume snapshot. +.. note:: The key_name and value should be a strings of ASCII characters (as + specified in Python's ``string.printable``). The key_name is + case-insensitive and always stored in lowercase. -Get custom metadata set on the snapshot using the metadata key:: +.. note:: Custom metadata on a snapshot is not preserved when snapshotting the + subvolume, and hence is also not preserved when cloning the subvolume + snapshot. - $ ceph fs subvolume snapshot metadata get [--group_name ] +Get custom metadata set on the snapshot using the metadata key: -List custom metadata (key-value pairs) set on the snapshot using:: +.. 
prompt:: bash $ - $ ceph fs subvolume snapshot metadata ls [--group_name ] + ceph fs subvolume snapshot metadata get [--group_name ] -Remove custom metadata set on the snapshot using the metadata key:: +List custom metadata (key-value pairs) set on the snapshot using: - $ ceph fs subvolume snapshot metadata rm [--group_name ] [--force] +.. prompt:: bash $ + + ceph fs subvolume snapshot metadata ls [--group_name ] + +Remove custom metadata set on the snapshot using the metadata key: + +.. prompt:: bash $ + + ceph fs subvolume snapshot metadata rm [--group_name ] [--force] Using the ``--force`` flag allows the command to succeed that would otherwise fail if the metadata key did not exist. @@ -468,47 +559,73 @@ fail if the metadata key did not exist. Cloning Snapshots ----------------- -Subvolumes can be created by cloning subvolume snapshots. Cloning is an asynchronous operation that copies -data from a snapshot to a subvolume. Due to this bulk copying, cloning is inefficient for very large -data sets. +Subvolumes can be created by cloning subvolume snapshots. Cloning is an +asynchronous operation that copies data from a snapshot to a subvolume. Due to +this bulk copying, cloning is inefficient for very large data sets. -.. note:: Removing a snapshot (source subvolume) would fail if there are pending or in progress clone operations. +.. note:: Removing a snapshot (source subvolume) would fail if there are + pending or in progress clone operations. -Protecting snapshots prior to cloning was a prerequisite in the Nautilus release, and the commands to protect/unprotect -snapshots were introduced for this purpose. This prerequisite, and hence the commands to protect/unprotect, is being -deprecated and may be removed from a future release. +Protecting snapshots prior to cloning was a prerequisite in the Nautilus +release, and the commands to protect/unprotect snapshots were introduced for +this purpose. This prerequisite, and hence the commands to protect/unprotect, +is being deprecated and may be removed from a future release. -The commands being deprecated are:: - $ ceph fs subvolume snapshot protect [--group_name ] - $ ceph fs subvolume snapshot unprotect [--group_name ] +The commands being deprecated are: -.. note:: Using the above commands will not result in an error, but they have no useful purpose. +.. prompt:: bash # -.. note:: Use the ``subvolume info`` command to fetch subvolume metadata regarding supported ``features`` to help decide if protect/unprotect of snapshots is required, based on the availability of the ``snapshot-autoprotect`` feature. + ceph fs subvolume snapshot protect [--group_name ] + ceph fs subvolume snapshot unprotect [--group_name ] -To initiate a clone operation use:: +.. note:: Using the above commands will not result in an error, but they have + no useful purpose. - $ ceph fs subvolume snapshot clone +.. note:: Use the ``subvolume info`` command to fetch subvolume metadata + regarding supported ``features`` to help decide if protect/unprotect of + snapshots is required, based on the availability of the + ``snapshot-autoprotect`` feature. -If a snapshot (source subvolume) is a part of non-default group, the group name needs to be specified:: +To initiate a clone operation use: - $ ceph fs subvolume snapshot clone --group_name +.. prompt:: bash $ -Cloned subvolumes can be a part of a different group than the source snapshot (by default, cloned subvolumes are created in default group). 
To clone to a particular group use:: + ceph fs subvolume snapshot clone + +If a snapshot (source subvolume) is a part of non-default group, the group name +needs to be specified: + +.. prompt:: bash $ + + ceph fs subvolume snapshot clone --group_name + +Cloned subvolumes can be a part of a different group than the source snapshot +(by default, cloned subvolumes are created in default group). To clone to a +particular group use: + +.. prompt:: bash $ $ ceph fs subvolume snapshot clone --target_group_name -Similar to specifying a pool layout when creating a subvolume, pool layout can be specified when creating a cloned subvolume. To create a cloned subvolume with a specific pool layout use:: +Similar to specifying a pool layout when creating a subvolume, pool layout can +be specified when creating a cloned subvolume. To create a cloned subvolume +with a specific pool layout use: - $ ceph fs subvolume snapshot clone --pool_layout +.. prompt:: bash $ -Configure the maximum number of concurrent clones. The default is 4:: + ceph fs subvolume snapshot clone --pool_layout - $ ceph config set mgr mgr/volumes/max_concurrent_clones +Configure the maximum number of concurrent clones. The default is 4: -To check the status of a clone operation use:: +.. prompt:: bash $ - $ ceph fs clone status [--group_name ] + ceph config set mgr mgr/volumes/max_concurrent_clones + +To check the status of a clone operation use: + +.. prompt:: bash $ + + ceph fs clone status [--group_name ] A clone can be in one of the following states: @@ -538,7 +655,8 @@ Here is an example of an ``in-progress`` clone:: } } -.. note:: The ``failure`` section will be shown only if the clone's state is ``failed`` or ``cancelled`` +.. note:: The ``failure`` section will be shown only if the clone's state is + ``failed`` or ``cancelled`` Here is an example of a ``failed`` clone:: @@ -560,9 +678,11 @@ Here is an example of a ``failed`` clone:: } } -(NOTE: since ``subvol1`` is in the default group, the ``source`` object's ``clone status`` does not include the group name) +(NOTE: since ``subvol1`` is in the default group, the ``source`` object's +``clone status`` does not include the group name) -.. note:: Cloned subvolumes are accessible only after the clone operation has successfully completed. +.. note:: Cloned subvolumes are accessible only after the clone operation has + successfully completed. After a successful clone operation, ``clone status`` will look like the below:: @@ -576,37 +696,47 @@ After a successful clone operation, ``clone status`` will look like the below:: If a clone operation is unsuccessful, the ``state`` value will be ``failed``. To retry a failed clone operation, the incomplete clone must be deleted and the -clone operation must be issued again. To delete a partial clone use:: +clone operation must be issued again. To delete a partial clone use: - $ ceph fs subvolume rm [--group_name ] --force +.. prompt:: bash $ + + ceph fs subvolume rm [--group_name ] --force .. note:: Cloning synchronizes only directories, regular files and symbolic links. Inode timestamps (access and modification times) are synchronized up to seconds granularity. An ``in-progress`` or a ``pending`` clone operation may be canceled. To cancel -a clone operation use the ``clone cancel`` command:: +a clone operation use the ``clone cancel`` command: - $ ceph fs clone cancel [--group_name ] +.. 
prompt:: bash $ -On successful cancellation, the cloned subvolume is moved to the ``canceled`` -state:: + ceph fs clone cancel [--group_name ] - $ ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1 - $ ceph fs clone cancel cephfs clone1 - $ ceph fs clone status cephfs clone1 - { - "status": { - "state": "canceled", - "source": { - "volume": "cephfs", - "subvolume": "subvol1", - "snapshot": "snap1" - } +On successful cancellation, the cloned subvolume is moved to the ``canceled`` state: + +.. prompt:: bash # + + ceph fs subvolume snapshot clone cephfs subvol1 snap1 clone1 + ceph fs clone cancel cephfs clone1 + ceph fs clone status cephfs clone1 + +:: + + { + "status": { + "state": "canceled", + "source": { + "volume": "cephfs", + "subvolume": "subvol1", + "snapshot": "snap1" + } + } } } -.. note:: The canceled cloned may be deleted by supplying the ``--force`` option to the `fs subvolume rm` command. +.. note:: The canceled cloned may be deleted by supplying the ``--force`` + option to the `fs subvolume rm` command. .. _subvol-pinning: @@ -614,28 +744,33 @@ state:: Pinning Subvolumes and Subvolume Groups --------------------------------------- - Subvolumes and subvolume groups may be automatically pinned to ranks according to policies. This can distribute load across MDS ranks in predictable and stable ways. Review :ref:`cephfs-pinning` and :ref:`cephfs-ephemeral-pinning` for details on how pinning works. -Pinning is configured by:: +Pinning is configured by: - $ ceph fs subvolumegroup pin +.. prompt:: bash $ -or for subvolumes:: + ceph fs subvolumegroup pin - $ ceph fs subvolume pin +or for subvolumes: + +.. prompt:: bash $ + + ceph fs subvolume pin Typically you will want to set subvolume group pins. The ``pin_type`` may be one of ``export``, ``distributed``, or ``random``. The ``pin_setting`` corresponds to the extended attributed "value" as in the pinning documentation referenced above. -So, for example, setting a distributed pinning strategy on a subvolume group:: +So, for example, setting a distributed pinning strategy on a subvolume group: - $ ceph fs subvolumegroup pin cephfilesystem-a csi distributed 1 +.. prompt:: bash $ + + ceph fs subvolumegroup pin cephfilesystem-a csi distributed 1 Will enable distributed subtree partitioning policy for the "csi" subvolume group. This will cause every subvolume within the group to be automatically diff --git a/ceph/doc/cephfs/health-messages.rst b/ceph/doc/cephfs/health-messages.rst index 28ceb704a..bb461be7b 100644 --- a/ceph/doc/cephfs/health-messages.rst +++ b/ceph/doc/cephfs/health-messages.rst @@ -123,7 +123,9 @@ other daemons, please see :ref:`health-checks`. from properly cleaning up resources used by client requests. This message appears if a client appears to have more than ``max_completed_requests`` (default 100000) requests that are complete on the MDS side but haven't - yet been accounted for in the client's *oldest tid* value. + yet been accounted for in the client's *oldest tid* value. The last tid + used by the MDS to trim completed client requests (or flush) is included + as part of `session ls` (or `client ls`) command as a debug aid. * ``MDS_DAMAGE`` Message @@ -168,3 +170,15 @@ other daemons, please see :ref:`health-checks`. the actual cache size (in memory) is at least 50% greater than ``mds_cache_memory_limit`` (default 1GB). Modify ``mds_health_cache_threshold`` to set the warning ratio. 
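For example, to raise the warning threshold so that this health message is
generated only when the reported cache size reaches twice
``mds_cache_memory_limit`` (the value ``2.0`` is illustrative):

.. prompt:: bash $

   ceph config set mds mds_health_cache_threshold 2.0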
+ +* ``MDS_CLIENTS_LAGGY`` + + Message + "Client *ID* is laggy; not evicted because some OSD(s) is/are laggy" + + Description + If OSD(s) is laggy (due to certain conditions like network cut-off, etc) + then it might make clients laggy(session might get idle or cannot flush + dirty data for cap revokes). If ``defer_client_eviction_on_laggy_osds`` is + set to true (default true), client eviction will not take place and thus + this health warning will be generated. diff --git a/ceph/doc/cephfs/mds-config-ref.rst b/ceph/doc/cephfs/mds-config-ref.rst index 2efc83b41..2b22a844f 100644 --- a/ceph/doc/cephfs/mds-config-ref.rst +++ b/ceph/doc/cephfs/mds-config-ref.rst @@ -501,6 +501,25 @@ :Type: 32-bit Integer :Default: ``0`` +``mds_inject_skip_replaying_inotable`` + +:Description: Ceph will skip replaying the inotable when replaying the journal, + and the premary MDS will crash, while the replacing MDS won't. + (for developers only). + +:Type: Boolean +:Default: ``false`` + + +``mds_kill_skip_replaying_inotable`` + +:Description: Ceph will skip replaying the inotable when replaying the journal, + and the premary MDS will crash, while the replacing MDS won't. + (for developers only). + +:Type: Boolean +:Default: ``false`` + ``mds_wipe_sessions`` diff --git a/ceph/doc/cephfs/mount-using-fuse.rst b/ceph/doc/cephfs/mount-using-fuse.rst index 27768f503..bd098dc91 100644 --- a/ceph/doc/cephfs/mount-using-fuse.rst +++ b/ceph/doc/cephfs/mount-using-fuse.rst @@ -53,7 +53,8 @@ If you have more than one FS on your Ceph cluster, use the option ceph-fuse --id foo --client_fs mycephfs2 /mnt/mycephfs2 -You may also add a ``client_fs`` setting to your ``ceph.conf`` +You may also add a ``client_fs`` setting to your ``ceph.conf``. Alternatively, the option +``--client_mds_namespace`` is supported for backward compatibility. Unmounting CephFS ================= diff --git a/ceph/doc/cephfs/mount-using-kernel-driver.rst b/ceph/doc/cephfs/mount-using-kernel-driver.rst index 5bd98dd51..4aaf480df 100644 --- a/ceph/doc/cephfs/mount-using-kernel-driver.rst +++ b/ceph/doc/cephfs/mount-using-kernel-driver.rst @@ -96,6 +96,28 @@ non-default FS as follows:: mount -t ceph :/ /mnt/mycephfs2 -o name=fs,fs=mycephfs2 +Backward Compatibility +====================== +The old syntax is supported for backward compatibility. + +To mount CephFS with the kernel driver:: + + mkdir /mnt/mycephfs + mount -t ceph :/ /mnt/mycephfs -o name=admin + +The key-value argument right after option ``-o`` is CephX credential; +``name`` is the username of the CephX user we are using to mount CephFS. + +To mount a non-default FS ``cephfs2``, in case the cluster has multiple FSs:: + + mount -t ceph :/ /mnt/mycephfs -o name=admin,fs=cephfs2 + + or + + mount -t ceph :/ /mnt/mycephfs -o name=admin,mds_namespace=cephfs2 + +.. note:: The option ``mds_namespace`` is deprecated. Use ``fs=`` instead when using the old syntax for mounting. + Unmounting CephFS ================= To unmount the Ceph file system, use the ``umount`` command as usual:: diff --git a/ceph/doc/cephfs/nfs.rst b/ceph/doc/cephfs/nfs.rst index 6c44b8650..f06911d5c 100644 --- a/ceph/doc/cephfs/nfs.rst +++ b/ceph/doc/cephfs/nfs.rst @@ -60,6 +60,18 @@ added as comments in the sample conf. There are options to do the following: - enable read delegations (need at least v13.0.1 'libcephfs2' package and v2.6.0 stable 'nfs-ganesha' and 'nfs-ganesha-ceph' packages) +.. important:: + + Under certain conditions, NFS access using the CephFS FSAL fails. 
This + causes an error to be thrown that reads "Input/output error". Under these + circumstances, the application metadata must be set for the CephFS metadata + and CephFS data pools. Do this by running the following command: + + .. prompt:: bash $ + + ceph osd pool application set cephfs cephfs + + Configuration for libcephfs clients ----------------------------------- diff --git a/ceph/doc/cephfs/scrub.rst b/ceph/doc/cephfs/scrub.rst index 114f7580a..5b813f1c4 100644 --- a/ceph/doc/cephfs/scrub.rst +++ b/ceph/doc/cephfs/scrub.rst @@ -143,3 +143,14 @@ The types of damage that can be reported and repaired by File System Scrub are: * BACKTRACE : Inode's backtrace in the data pool is corrupted. +Evaluate strays using recursive scrub +===================================== + +- In order to evaluate strays i.e. purge stray directories in ``~mdsdir`` use the following command:: + + ceph tell mds.:0 scrub start ~mdsdir recursive + +- ``~mdsdir`` is not enqueued by default when scrubbing at the CephFS root. In order to perform stray evaluation + at root, run scrub with flags ``scrub_mdsdir`` and ``recursive``:: + + ceph tell mds.:0 scrub start / recursive,scrub_mdsdir diff --git a/ceph/doc/cephfs/snap-schedule.rst b/ceph/doc/cephfs/snap-schedule.rst index 2728203f4..fb9c85b5a 100644 --- a/ceph/doc/cephfs/snap-schedule.rst +++ b/ceph/doc/cephfs/snap-schedule.rst @@ -142,6 +142,19 @@ Examples:: ceph fs snap-schedule retention add / 24h4w # add 24 hourly and 4 weekly to retention ceph fs snap-schedule retention remove / 7d4w # remove 7 daily and 4 weekly, leaves 24 hourly +.. note: When adding a path to snap-schedule, remember to strip off the mount + point path prefix. Paths to snap-schedule should start at the appropriate + CephFS file system root and not at the host file system root. + e.g. if the Ceph File System is mounted at ``/mnt`` and the path under which + snapshots need to be taken is ``/mnt/some/path`` then the acutal path required + by snap-schedule is only ``/some/path``. + +.. note: It should be noted that the "created" field in the snap-schedule status + command output is the timestamp at which the schedule was created. The "created" + timestamp has nothing to do with the creation of actual snapshots. The actual + snapshot creation is accounted for in the "created_count" field, which is a + cumulative count of the total number of snapshots created so far. + Active and inactive schedules ----------------------------- Snapshot schedules can be added for a path that doesn't exist yet in the diff --git a/ceph/doc/cephfs/troubleshooting.rst b/ceph/doc/cephfs/troubleshooting.rst index 78ad18dde..60de0c1a3 100644 --- a/ceph/doc/cephfs/troubleshooting.rst +++ b/ceph/doc/cephfs/troubleshooting.rst @@ -188,6 +188,98 @@ You can enable dynamic debug against the CephFS module. Please see: https://github.com/ceph/ceph/blob/master/src/script/kcon_all.sh +In-memory Log Dump +================== + +In-memory logs can be dumped by setting ``mds_extraordinary_events_dump_interval`` +during a lower level debugging (log level < 10). ``mds_extraordinary_events_dump_interval`` +is the interval in seconds for dumping the recent in-memory logs when there is an Extra-Ordinary event. + +The Extra-Ordinary events are classified as: + +* Client Eviction +* Missed Beacon ACK from the monitors +* Missed Internal Heartbeats + +In-memory Log Dump is disabled by default to prevent log file bloat in a production environment. 
+The below commands consecutively enables it:: + + $ ceph config set mds debug_mds / + $ ceph config set mds mds_extraordinary_events_dump_interval + +The ``log_level`` should be < 10 and ``gather_level`` should be >= 10 to enable in-memory log dump. +When it is enabled, the MDS checks for the extra-ordinary events every +``mds_extraordinary_events_dump_interval`` seconds and if any of them occurs, MDS dumps the +in-memory logs containing the relevant event details in ceph-mds log. + +.. note:: For higher log levels (log_level >= 10) there is no reason to dump the In-memory Logs and a + lower gather level (gather_level < 10) is insufficient to gather In-memory Logs. Thus a + log level >=10 or a gather level < 10 in debug_mds would prevent enabling the In-memory Log Dump. + In such cases, when there is a failure it's required to reset the value of + mds_extraordinary_events_dump_interval to 0 before enabling using the above commands. + +The In-memory Log Dump can be disabled using:: + + $ ceph config set mds mds_extraordinary_events_dump_interval 0 + +Filesystems Become Inaccessible After an Upgrade +================================================ + +.. note:: + You can avoid ``operation not permitted`` errors by running this procedure + before an upgrade. As of May 2023, it seems that ``operation not permitted`` + errors of the kind discussed here occur after upgrades after Nautilus + (inclusive). + +IF + +you have CephFS file systems that have data and metadata pools that were +created by a ``ceph fs new`` command (meaning that they were not created +with the defaults) + +OR + +you have an existing CephFS file system and are upgrading to a new post-Nautilus +major version of Ceph + +THEN + +in order for the documented ``ceph fs authorize...`` commands to function as +documented (and to avoid 'operation not permitted' errors when doing file I/O +or similar security-related problems for all users except the ``client.admin`` +user), you must first run: + +.. prompt:: bash $ + + ceph osd pool application set cephfs metadata + +and + +.. prompt:: bash $ + + ceph osd pool application set cephfs data + +Otherwise, when the OSDs receive a request to read or write data (not the +directory info, but file data) they will not know which Ceph file system name +to look up. This is true also of pool names, because the 'defaults' themselves +changed in the major releases, from:: + + data pool=fsname + metadata pool=fsname_metadata + +to:: + + data pool=fsname.data and + metadata pool=fsname.meta + +Any setup that used ``client.admin`` for all mounts did not run into this +problem, because the admin key gave blanket permissions. + +A temporary fix involves changing mount requests to the 'client.admin' user and +its associated key. A less drastic but half-fix is to change the osd cap for +your user to just ``caps osd = "allow rw"`` and delete ``tag cephfs +data=....`` + Reporting Issues ================ diff --git a/ceph/doc/dev/network-encoding.rst b/ceph/doc/dev/network-encoding.rst index d59b0ee9e..d5d1a6d15 100644 --- a/ceph/doc/dev/network-encoding.rst +++ b/ceph/doc/dev/network-encoding.rst @@ -87,7 +87,8 @@ Optionals are represented as a presence byte, followed by the item if it exists. T element[present? 1 : 0]; // Only if present is non-zero. } -Optionals are used to encode ``boost::optional``. +Optionals are used to encode ``boost::optional`` and, since introducing +C++17 to Ceph, ``std::optional``. 
Pair ---- diff --git a/ceph/doc/dev/osd_internals/erasure_coding/jerasure.rst b/ceph/doc/dev/osd_internals/erasure_coding/jerasure.rst index 27669a0b2..ac3636720 100644 --- a/ceph/doc/dev/osd_internals/erasure_coding/jerasure.rst +++ b/ceph/doc/dev/osd_internals/erasure_coding/jerasure.rst @@ -5,7 +5,7 @@ jerasure plugin Introduction ------------ -The parameters interpreted by the jerasure plugin are: +The parameters interpreted by the ``jerasure`` plugin are: :: @@ -31,3 +31,5 @@ upstream repositories `http://jerasure.org/jerasure/jerasure `http://jerasure.org/jerasure/gf-complete `_ . The difference between the two, if any, should match pull requests against upstream. +Note that as of 2023, the ``jerasure.org`` web site may no longer be +legitimate and/or associated with the original project. diff --git a/ceph/doc/dev/osd_internals/past_intervals.rst b/ceph/doc/dev/osd_internals/past_intervals.rst new file mode 100644 index 000000000..5b594df1a --- /dev/null +++ b/ceph/doc/dev/osd_internals/past_intervals.rst @@ -0,0 +1,93 @@ +============= +PastIntervals +============= + +Purpose +------- + +There are two situations where we need to consider the set of all acting-set +OSDs for a PG back to some epoch ``e``: + + * During peering, we need to consider the acting set for every epoch back to + ``last_epoch_started``, the last epoch in which the PG completed peering and + became active. + (see :doc:`/dev/osd_internals/last_epoch_started` for a detailed explanation) + * During recovery, we need to consider the acting set for every epoch back to + ``last_epoch_clean``, the last epoch at which all of the OSDs in the acting + set were fully recovered, and the acting set was full. + +For either of these purposes, we could build such a set by iterating backwards +from the current OSDMap to the relevant epoch. Instead, we maintain a structure +PastIntervals for each PG. + +An ``interval`` is a contiguous sequence of OSDMap epochs where the PG mapping +didn't change. This includes changes to the acting set, the up set, the +primary, and several other parameters fully spelled out in +PastIntervals::check_new_interval. + +Maintenance and Trimming +------------------------ + +The PastIntervals structure stores a record for each ``interval`` back to +last_epoch_clean. On each new ``interval`` (See AdvMap reactions, +PeeringState::should_restart_peering, and PeeringState::start_peering_interval) +each OSD with the PG will add the new ``interval`` to its local PastIntervals. +Activation messages to OSDs which do not already have the PG contain the +sender's PastIntervals so that the recipient needn't rebuild it. (See +PeeringState::activate needs_past_intervals). + +PastIntervals are trimmed in two places. First, when the primary marks the +PG clean, it clears its past_intervals instance +(PeeringState::try_mark_clean()). The replicas will do the same thing when +they receive the info (See PeeringState::update_history). + +The second, more complex, case is in PeeringState::start_peering_interval. In +the event of a "map gap", we assume that the PG actually has gone clean, but we +haven't received a pg_info_t with the updated ``last_epoch_clean`` value yet. +To explain this behavior, we need to discuss OSDMap trimming. + +OSDMap Trimming +--------------- + +OSDMaps are created by the Monitor quorum and gossiped out to the OSDs. The +Monitor cluster also determines when OSDs (and the Monitors) are allowed to +trim old OSDMap epochs. 
For the reasons explained above in this document, the +primary constraint is that we must retain all OSDMaps back to some epoch such +that all PGs have been clean at that or a later epoch (min_last_epoch_clean). +(See OSDMonitor::get_trim_to). + +The Monitor quorum determines min_last_epoch_clean through MOSDBeacon messages +sent periodically by each OSDs. Each message contains a set of PGs for which +the OSD is primary at that moment as well as the min_last_epoch_clean across +that set. The Monitors track these values in OSDMonitor::last_epoch_clean. + +There is a subtlety in the min_last_epoch_clean value used by the OSD to +populate the MOSDBeacon. OSD::collect_pg_stats invokes PG::with_pg_stats to +obtain the lec value, which actually uses +pg_stat_t::get_effective_last_epoch_clean() rather than +info.history.last_epoch_clean. If the PG is currently clean, +pg_stat_t::get_effective_last_epoch_clean() is the current epoch rather than +last_epoch_clean -- this works because the PG is clean at that epoch and it +allows OSDMaps to be trimmed during periods where OSDMaps are being created +(due to snapshot activity, perhaps), but no PGs are undergoing ``interval`` +changes. + +Back to PastIntervals +--------------------- + +We can now understand our second trimming case above. If OSDMaps have been +trimmed up to epoch ``e``, we know that the PG must have been clean at some epoch +>= ``e`` (indeed, **all** PGs must have been), so we can drop our PastIntevals. + +This dependency also pops up in PeeringState::check_past_interval_bounds(). +PeeringState::get_required_past_interval_bounds takes as a parameter +oldest_epoch, which comes from OSDSuperblock::cluster_osdmap_trim_lower_bound. +We use cluster_osdmap_trim_lower_bound rather than a specific osd's oldest_map +because we don't necessarily trim all MOSDMap::cluster_osdmap_trim_lower_bound. +In order to avoid doing too much work at once we limit the amount of osdmaps +trimmed using ``osd_target_transaction_size`` in OSD::trim_maps(). +For this reason, a specific OSD's oldest_map can lag behind +OSDSuperblock::cluster_osdmap_trim_lower_bound +for a while. + +See https://tracker.ceph.com/issues/49689 for an example. diff --git a/ceph/doc/glossary.rst b/ceph/doc/glossary.rst index b87818194..0ccbd37af 100644 --- a/ceph/doc/glossary.rst +++ b/ceph/doc/glossary.rst @@ -12,12 +12,13 @@ :ref:`BlueStore` OSD BlueStore is a storage back end used by OSD daemons, and was designed specifically for use with Ceph. BlueStore was - introduced in the Ceph Kraken release. In the Ceph Luminous - release, BlueStore became Ceph's default storage back end, - supplanting FileStore. Unlike :term:`filestore`, BlueStore - stores objects directly on Ceph block devices without any file - system interface. Since Luminous (12.2), BlueStore has been - Ceph's default and recommended storage back end. + introduced in the Ceph Kraken release. The Luminous release of + Ceph promoted BlueStore to the default OSD back end, + supplanting FileStore. As of the Reef release, FileStore is no + longer available as a storage backend. + + BlueStore stores objects directly on Ceph block devices without + a mounted file system. Bucket In the context of :term:`RGW`, a bucket is a group of objects. diff --git a/ceph/doc/index.rst b/ceph/doc/index.rst index 98c1be894..bda03d8ed 100644 --- a/ceph/doc/index.rst +++ b/ceph/doc/index.rst @@ -11,6 +11,12 @@ Ceph delivers **object, block, and file storage in one unified system**. Ceph project. 
(Click anywhere in this paragraph to read the "Basic Workflow" page of the Ceph Developer Guide.) `. +.. note:: + + :ref:`If you want to make a commit to the documentation but you don't + know how to get started, read the "Documenting Ceph" page. (Click anywhere + in this paragraph to read the "Documenting Ceph" page.) `. + .. raw:: html diff --git a/ceph/doc/man/8/cephfs-top.rst b/ceph/doc/man/8/cephfs-top.rst index c3719cd36..fd18ada77 100644 --- a/ceph/doc/man/8/cephfs-top.rst +++ b/ceph/doc/man/8/cephfs-top.rst @@ -36,6 +36,22 @@ Options Perform a selftest. This mode performs a sanity check of ``stats`` module. +.. option:: --conffile [CONFFILE] + + Path to cluster configuration file + +.. option:: -d [DELAY], --delay [DELAY] + + Refresh interval in seconds (default: 1) + +.. option:: --dump + + Dump the metrics to stdout + +.. option:: --dumpfs + + Dump the metrics of the given filesystem to stdout + Descriptions of fields ====================== diff --git a/ceph/doc/man/8/mount.ceph.rst b/ceph/doc/man/8/mount.ceph.rst index 1c67a12f8..41cbcaea2 100644 --- a/ceph/doc/man/8/mount.ceph.rst +++ b/ceph/doc/man/8/mount.ceph.rst @@ -110,6 +110,12 @@ Basic them. If an inode contains any stale file locks, read/write on the inode is not allowed until applications release all stale file locks. +:command: `fs=` + Specify the non-default file system to be mounted, when using the old syntax. + +:command: `mds_namespace=` + A synonym of "fs=" (Deprecated). + Advanced -------- :command:`cap_release_safety` @@ -236,6 +242,10 @@ history:: mount.ceph :/ /mnt/mycephfs -o name=fs_username,secretfile=/etc/ceph/fs_username.secret +To mount using the old syntax:: + + mount -t ceph 192.168.0.1:/ /mnt/mycephfs + Availability ============ diff --git a/ceph/doc/mgr/prometheus.rst b/ceph/doc/mgr/prometheus.rst index e7e00d958..13b8ff5cc 100644 --- a/ceph/doc/mgr/prometheus.rst +++ b/ceph/doc/mgr/prometheus.rst @@ -18,9 +18,11 @@ for all reporting entities are returned in text exposition format. Enabling prometheus output ========================== -The *prometheus* module is enabled with:: +The *prometheus* module is enabled with: - ceph mgr module enable prometheus +.. prompt:: bash $ + + ceph mgr module enable prometheus Configuration ------------- @@ -36,10 +38,10 @@ configurable with ``ceph config set``, with keys is registered with Prometheus's `registry `_. -:: - - ceph config set mgr mgr/prometheus/server_addr 0.0.0.0 - ceph config set mgr mgr/prometheus/server_port 9283 +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/server_addr 0.0.0. + ceph config set mgr mgr/prometheus/server_port 9283 .. warning:: @@ -54,9 +56,11 @@ recommended to use 15 seconds as scrape interval, though, in some cases it might be useful to increase the scrape interval. To set a different scrape interval in the Prometheus module, set -``scrape_interval`` to the desired value:: +``scrape_interval`` to the desired value: - ceph config set mgr mgr/prometheus/scrape_interval 20 +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/scrape_interval 20 On large clusters (>1000 OSDs), the time to fetch the metrics may become significant. Without the cache, the Prometheus manager module could, especially @@ -75,35 +79,47 @@ This behavior can be configured. By default, it will return a 503 HTTP status code (service unavailable). You can set other options using the ``ceph config set`` commands. 
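For instance, to confirm the currently configured behaviour before changing it
(assuming the module is already enabled), you can query a key such as:

.. prompt:: bash $

   ceph config get mgr mgr/prometheus/stale_cache_strategy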
-To tell the module to respond with possibly stale data, set it to ``return``:: +To tell the module to respond with possibly stale data, set it to ``return``: + +.. prompt:: bash $ ceph config set mgr mgr/prometheus/stale_cache_strategy return -To tell the module to respond with "service unavailable", set it to ``fail``:: +To tell the module to respond with "service unavailable", set it to ``fail``: - ceph config set mgr mgr/prometheus/stale_cache_strategy fail +.. prompt:: bash $ -If you are confident that you don't require the cache, you can disable it:: + ceph config set mgr mgr/prometheus/stale_cache_strategy fail - ceph config set mgr mgr/prometheus/cache false +If you are confident that you don't require the cache, you can disable it: + +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/cache false If you are using the prometheus module behind some kind of reverse proxy or loadbalancer, you can simplify discovering the active instance by switching -to ``error``-mode:: +to ``error``-mode: - ceph config set mgr mgr/prometheus/standby_behaviour error +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/standby_behaviour error If set, the prometheus module will repond with a HTTP error when requesting ``/`` from the standby instance. The default error code is 500, but you can configure -the HTTP response code with:: +the HTTP response code with: - ceph config set mgr mgr/prometheus/standby_error_status_code 503 +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/standby_error_status_code 503 Valid error codes are between 400-599. -To switch back to the default behaviour, simply set the config key to ``default``:: +To switch back to the default behaviour, simply set the config key to ``default``: - ceph config set mgr mgr/prometheus/standby_behaviour default +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/standby_behaviour default .. _prometheus-rbd-io-statistics: @@ -154,9 +170,17 @@ configuration parameter. The parameter is a comma or space separated list of ``pool[/namespace]`` entries. If the namespace is not specified the statistics are collected for all namespaces in the pool. -Example to activate the RBD-enabled pools ``pool1``, ``pool2`` and ``poolN``:: +Example to activate the RBD-enabled pools ``pool1``, ``pool2`` and ``poolN``: - ceph config set mgr mgr/prometheus/rbd_stats_pools "pool1,pool2,poolN" +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/rbd_stats_pools "pool1,pool2,poolN" + +The wildcard can be used to indicate all pools or namespaces: + +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/rbd_stats_pools "*" The module makes the list of all available images scanning the specified pools and namespaces and refreshes it periodically. The period is @@ -165,9 +189,22 @@ parameter (in sec) and is 300 sec (5 minutes) by default. The module will force refresh earlier if it detects statistics from a previously unknown RBD image. -Example to turn up the sync interval to 10 minutes:: +Example to turn up the sync interval to 10 minutes: - ceph config set mgr mgr/prometheus/rbd_stats_pools_refresh_interval 600 +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/rbd_stats_pools_refresh_interval 600 + +Ceph daemon performance counters metrics +----------------------------------------- + +With the introduction of ``ceph-exporter`` daemon, the prometheus module will no longer export Ceph daemon +perf counters as prometheus metrics by default. 
However, one may re-enable exporting these metrics by setting +the module option ``exclude_perf_counters`` to ``false``: + +.. prompt:: bash $ + + ceph config set mgr mgr/prometheus/exclude_perf_counters false Statistic names and labels ========================== diff --git a/ceph/doc/mgr/telemetry.rst b/ceph/doc/mgr/telemetry.rst index 6eaaa5c44..a2479c3f3 100644 --- a/ceph/doc/mgr/telemetry.rst +++ b/ceph/doc/mgr/telemetry.rst @@ -153,3 +153,24 @@ completely optional, and disabled by default.:: ceph config set mgr mgr/telemetry/description 'My first Ceph cluster' ceph config set mgr mgr/telemetry/channel_ident true +Leaderboard +----------- + +To participate in a leaderboard in the `public dashboards +`_, run the following command: + +.. prompt:: bash $ + + ceph config set mgr mgr/telemetry/leaderboard true + +The leaderboard displays basic information about the cluster. This includes the +total storage capacity and the number of OSDs. To add a description of the +cluster, run a command of the following form: + +.. prompt:: bash $ + + ceph config set mgr mgr/telemetry/leaderboard_description 'Ceph cluster for Computational Biology at the University of XYZ' + +If the ``ident`` channel is enabled, its details will not be displayed in the +leaderboard. + diff --git a/ceph/doc/rados/configuration/bluestore-config-ref.rst b/ceph/doc/rados/configuration/bluestore-config-ref.rst index 3bfc8e295..6ad316050 100644 --- a/ceph/doc/rados/configuration/bluestore-config-ref.rst +++ b/ceph/doc/rados/configuration/bluestore-config-ref.rst @@ -1,84 +1,95 @@ -========================== -BlueStore Config Reference -========================== +================================== + BlueStore Configuration Reference +================================== Devices ======= -BlueStore manages either one, two, or (in certain cases) three storage -devices. +BlueStore manages either one, two, or in certain cases three storage devices. +These *devices* are "devices" in the Linux/Unix sense. This means that they are +assets listed under ``/dev`` or ``/devices``. Each of these devices may be an +entire storage drive, or a partition of a storage drive, or a logical volume. +BlueStore does not create or mount a conventional file system on devices that +it uses; BlueStore reads and writes to the devices directly in a "raw" fashion. -In the simplest case, BlueStore consumes a single (primary) storage device. -The storage device is normally used as a whole, occupying the full device that -is managed directly by BlueStore. This *primary device* is normally identified -by a ``block`` symlink in the data directory. +In the simplest case, BlueStore consumes all of a single storage device. This +device is known as the *primary device*. The primary device is identified by +the ``block`` symlink in the data directory. -The data directory is a ``tmpfs`` mount which gets populated (at boot time, or -when ``ceph-volume`` activates it) with all the common OSD files that hold -information about the OSD, like: its identifier, which cluster it belongs to, -and its private keyring. +The data directory is a ``tmpfs`` mount. When this data directory is booted or +activated by ``ceph-volume``, it is populated with metadata files and links +that hold information about the OSD: for example, the OSD's identifier, the +name of the cluster that the OSD belongs to, and the OSD's private keyring. 
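As a quick illustration (assuming a non-containerized deployment and OSD id
``0``; paths differ per deployment), the contents of the data directory,
including the ``block`` symlink that points at the device or logical volume
managed by BlueStore, can be inspected with:

.. prompt:: bash $

   ls -l /var/lib/ceph/osd/ceph-0/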
-It is also possible to deploy BlueStore across one or two additional devices: +In more complicated cases, BlueStore is deployed across one or two additional +devices: -* A *write-ahead log (WAL) device* (identified as ``block.wal`` in the data directory) can be - used for BlueStore's internal journal or write-ahead log. It is only useful - to use a WAL device if the device is faster than the primary device (e.g., - when it is on an SSD and the primary device is an HDD). +* A *write-ahead log (WAL) device* (identified as ``block.wal`` in the data + directory) can be used to separate out BlueStore's internal journal or + write-ahead log. Using a WAL device is advantageous only if the WAL device + is faster than the primary device (for example, if the WAL device is an SSD + and the primary device is an HDD). * A *DB device* (identified as ``block.db`` in the data directory) can be used - for storing BlueStore's internal metadata. BlueStore (or rather, the - embedded RocksDB) will put as much metadata as it can on the DB device to - improve performance. If the DB device fills up, metadata will spill back - onto the primary device (where it would have been otherwise). Again, it is - only helpful to provision a DB device if it is faster than the primary - device. + to store BlueStore's internal metadata. BlueStore (or more precisely, the + embedded RocksDB) will put as much metadata as it can on the DB device in + order to improve performance. If the DB device becomes full, metadata will + spill back onto the primary device (where it would have been located in the + absence of the DB device). Again, it is advantageous to provision a DB device + only if it is faster than the primary device. -If there is only a small amount of fast storage available (e.g., less -than a gigabyte), we recommend using it as a WAL device. If there is -more, provisioning a DB device makes more sense. The BlueStore -journal will always be placed on the fastest device available, so -using a DB device will provide the same benefit that the WAL device -would while *also* allowing additional metadata to be stored there (if -it will fit). This means that if a DB device is specified but an explicit -WAL device is not, the WAL will be implicitly colocated with the DB on the faster -device. +If there is only a small amount of fast storage available (for example, less +than a gigabyte), we recommend using the available space as a WAL device. But +if more fast storage is available, it makes more sense to provision a DB +device. Because the BlueStore journal is always placed on the fastest device +available, using a DB device provides the same benefit that using a WAL device +would, while *also* allowing additional metadata to be stored off the primary +device (provided that it fits). DB devices make this possible because whenever +a DB device is specified but an explicit WAL device is not, the WAL will be +implicitly colocated with the DB on the faster device. -A single-device (colocated) BlueStore OSD can be provisioned with: +To provision a single-device (colocated) BlueStore OSD, run the following +command: .. prompt:: bash $ ceph-volume lvm prepare --bluestore --data -To specify a WAL device and/or DB device: +To specify a WAL device or DB device, run the following command: .. prompt:: bash $ ceph-volume lvm prepare --bluestore --data --block.wal --block.db -.. note:: ``--data`` can be a Logical Volume using *vg/lv* notation. Other - devices can be existing logical volumes or GPT partitions. +.. 
note:: The option ``--data`` can take as its argument any of the the + following devices: logical volumes specified using *vg/lv* notation, + existing logical volumes, and GPT partitions. + + Provisioning strategies ----------------------- -Although there are multiple ways to deploy a BlueStore OSD (unlike Filestore -which had just one), there are two common arrangements that should help clarify -the deployment strategy: + +BlueStore differs from Filestore in that there are several ways to deploy a +BlueStore OSD. However, the overall deployment strategy for BlueStore can be +clarified by examining just these two common arrangements: .. _bluestore-single-type-device-config: **block (data) only** ^^^^^^^^^^^^^^^^^^^^^ -If all devices are the same type, for example all rotational drives, and -there are no fast devices to use for metadata, it makes sense to specify the -block device only and to not separate ``block.db`` or ``block.wal``. The -:ref:`ceph-volume-lvm` command for a single ``/dev/sda`` device looks like: +If all devices are of the same type (for example, they are all HDDs), and if +there are no fast devices available for the storage of metadata, then it makes +sense to specify the block device only and to leave ``block.db`` and +``block.wal`` unseparated. The :ref:`ceph-volume-lvm` command for a single +``/dev/sda`` device is as follows: .. prompt:: bash $ ceph-volume lvm create --bluestore --data /dev/sda -If logical volumes have already been created for each device, (a single LV -using 100% of the device), then the :ref:`ceph-volume-lvm` call for an LV named -``ceph-vg/block-lv`` would look like: +If the devices to be used for a BlueStore OSD are pre-created logical volumes, +then the :ref:`ceph-volume-lvm` call for an logical volume named +``ceph-vg/block-lv`` is as follows: .. prompt:: bash $ @@ -88,15 +99,18 @@ using 100% of the device), then the :ref:`ceph-volume-lvm` call for an LV named **block and block.db** ^^^^^^^^^^^^^^^^^^^^^^ -If you have a mix of fast and slow devices (SSD / NVMe and rotational), -it is recommended to place ``block.db`` on the faster device while ``block`` -(data) lives on the slower (spinning drive). -You must create these volume groups and logical volumes manually as -the ``ceph-volume`` tool is currently not able to do so automatically. +If you have a mix of fast and slow devices (for example, SSD or HDD), then we +recommend placing ``block.db`` on the faster device while ``block`` (that is, +the data) is stored on the slower device (that is, the rotational drive). -For the below example, let us assume four rotational (``sda``, ``sdb``, ``sdc``, and ``sdd``) -and one (fast) solid state drive (``sdx``). First create the volume groups: +You must create these volume groups and these logical volumes manually. as The +``ceph-volume`` tool is currently unable to do so [create them?] automatically. + +The following procedure illustrates the manual creation of volume groups and +logical volumes. For this example, we shall assume four rotational drives +(``sda``, ``sdb``, ``sdc``, and ``sdd``) and one (fast) SSD (``sdx``). First, +to create the volume groups, run the following commands: .. prompt:: bash $ @@ -105,7 +119,7 @@ and one (fast) solid state drive (``sdx``). First create the volume groups: vgcreate ceph-block-2 /dev/sdc vgcreate ceph-block-3 /dev/sdd -Now create the logical volumes for ``block``: +Next, to create the logical volumes for ``block``, run the following commands: .. 
prompt:: bash $ @@ -114,8 +128,9 @@ Now create the logical volumes for ``block``: lvcreate -l 100%FREE -n block-2 ceph-block-2 lvcreate -l 100%FREE -n block-3 ceph-block-3 -We are creating 4 OSDs for the four slow spinning devices, so assuming a 200GB -SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB: +Because there are four HDDs, there will be four OSDs. Supposing that there is a +200GB SSD in ``/dev/sdx``, we can create four 50GB logical volumes by running +the following commands: .. prompt:: bash $ @@ -125,7 +140,7 @@ SSD in ``/dev/sdx`` we will create 4 logical volumes, each of 50GB: lvcreate -L 50GB -n db-2 ceph-db-0 lvcreate -L 50GB -n db-3 ceph-db-0 -Finally, create the 4 OSDs with ``ceph-volume``: +Finally, to create the four OSDs, run the following commands: .. prompt:: bash $ @@ -134,149 +149,153 @@ Finally, create the 4 OSDs with ``ceph-volume``: ceph-volume lvm create --bluestore --data ceph-block-2/block-2 --block.db ceph-db-0/db-2 ceph-volume lvm create --bluestore --data ceph-block-3/block-3 --block.db ceph-db-0/db-3 -These operations should end up creating four OSDs, with ``block`` on the slower -rotational drives with a 50 GB logical volume (DB) for each on the solid state -drive. +After this procedure is finished, there should be four OSDs, ``block`` should +be on the four HDDs, and each HDD should have a 50GB logical volume +(specifically, a DB device) on the shared SSD. Sizing ====== -When using a :ref:`mixed spinning and solid drive setup -` it is important to make a large enough -``block.db`` logical volume for BlueStore. Generally, ``block.db`` should have -*as large as possible* logical volumes. +When using a :ref:`mixed spinning-and-solid-drive setup +`, it is important to make a large enough +``block.db`` logical volume for BlueStore. The logical volumes associated with +``block.db`` should have logical volumes that are *as large as possible*. -The general recommendation is to have ``block.db`` size in between 1% to 4% -of ``block`` size. For RGW workloads, it is recommended that the ``block.db`` -size isn't smaller than 4% of ``block``, because RGW heavily uses it to store -metadata (omap keys). For example, if the ``block`` size is 1TB, then ``block.db`` shouldn't -be less than 40GB. For RBD workloads, 1% to 2% of ``block`` size is usually enough. +It is generally recommended that the size of ``block.db`` be somewhere between +1% and 4% of the size of ``block``. For RGW workloads, it is recommended that +the ``block.db`` be at least 4% of the ``block`` size, because RGW makes heavy +use of ``block.db`` to store metadata (in particular, omap keys). For example, +if the ``block`` size is 1TB, then ``block.db`` should have a size of at least +40GB. For RBD workloads, however, ``block.db`` usually needs no more than 1% to +2% of the ``block`` size. -In older releases, internal level sizes mean that the DB can fully utilize only -specific partition / LV sizes that correspond to sums of L0, L0+L1, L1+L2, -etc. sizes, which with default settings means roughly 3 GB, 30 GB, 300 GB, and -so forth. Most deployments will not substantially benefit from sizing to -accommodate L3 and higher, though DB compaction can be facilitated by doubling -these figures to 6GB, 60GB, and 600GB. +In older releases, internal level sizes are such that the DB can fully utilize +only those specific partition / logical volume sizes that correspond to sums of +L0, L0+L1, L1+L2, and so on--that is, given default settings, sizes of roughly +3GB, 30GB, 300GB, and so on. 
Most deployments do not substantially benefit from +sizing that accommodates L3 and higher, though DB compaction can be facilitated +by doubling these figures to 6GB, 60GB, and 600GB. -Improvements in releases beginning with Nautilus 14.2.12 and Octopus 15.2.6 -enable better utilization of arbitrary DB device sizes, and the Pacific -release brings experimental dynamic level support. Users of older releases may -thus wish to plan ahead by provisioning larger DB devices today so that their -benefits may be realized with future upgrades. - -When *not* using a mix of fast and slow devices, it isn't required to create -separate logical volumes for ``block.db`` (or ``block.wal``). BlueStore will -automatically colocate these within the space of ``block``. +Improvements in Nautilus 14.2.12, Octopus 15.2.6, and subsequent releases allow +for better utilization of arbitrarily-sized DB devices. Moreover, the Pacific +release brings experimental dynamic-level support. Because of these advances, +users of older releases might want to plan ahead by provisioning larger DB +devices today so that the benefits of scale can be realized when upgrades are +made in the future. +When *not* using a mix of fast and slow devices, there is no requirement to +create separate logical volumes for ``block.db`` or ``block.wal``. BlueStore +will automatically colocate these devices within the space of ``block``. Automatic Cache Sizing ====================== -BlueStore can be configured to automatically resize its caches when TCMalloc -is configured as the memory allocator and the ``bluestore_cache_autotune`` -setting is enabled. This option is currently enabled by default. BlueStore -will attempt to keep OSD heap memory usage under a designated target size via -the ``osd_memory_target`` configuration option. This is a best effort -algorithm and caches will not shrink smaller than the amount specified by -``osd_memory_cache_min``. Cache ratios will be chosen based on a hierarchy -of priorities. If priority information is not available, the -``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio`` options are -used as fallbacks. +BlueStore can be configured to automatically resize its caches, provided that +certain conditions are met: TCMalloc must be configured as the memory allocator +and the ``bluestore_cache_autotune`` configuration option must be enabled (note +that it is currently enabled by default). When automatic cache sizing is in +effect, BlueStore attempts to keep OSD heap-memory usage under a certain target +size (as determined by ``osd_memory_target``). This approach makes use of a +best-effort algorithm and caches do not shrink smaller than the size defined by +the value of ``osd_memory_cache_min``. Cache ratios are selected in accordance +with a hierarchy of priorities. But if priority information is not available, +the values specified in the ``bluestore_cache_meta_ratio`` and +``bluestore_cache_kv_ratio`` options are used as fallback cache ratios. + Manual Cache Sizing =================== -The amount of memory consumed by each OSD for BlueStore caches is -determined by the ``bluestore_cache_size`` configuration option. If -that config option is not set (i.e., remains at 0), there is a -different default value that is used depending on whether an HDD or -SSD is used for the primary device (set by the -``bluestore_cache_size_ssd`` and ``bluestore_cache_size_hdd`` config -options). 
+The amount of memory consumed by each OSD to be used for its BlueStore cache is +determined by the ``bluestore_cache_size`` configuration option. If that option +has not been specified (that is, if it remains at 0), then Ceph uses a +different configuration option to determine the default memory budget: +``bluestore_cache_size_hdd`` if the primary device is an HDD, or +``bluestore_cache_size_ssd`` if the primary device is an SSD. -BlueStore and the rest of the Ceph OSD daemon do the best they can -to work within this memory budget. Note that on top of the configured -cache size, there is also memory consumed by the OSD itself, and -some additional utilization due to memory fragmentation and other -allocator overhead. +BlueStore and the rest of the Ceph OSD daemon make every effort to work within +this memory budget. Note that in addition to the configured cache size, there +is also memory consumed by the OSD itself. There is additional utilization due +to memory fragmentation and other allocator overhead. -The configured cache memory budget can be used in a few different ways: +The configured cache-memory budget can be used to store the following types of +things: -* Key/Value metadata (i.e., RocksDB's internal cache) +* Key/Value metadata (that is, RocksDB's internal cache) * BlueStore metadata -* BlueStore data (i.e., recently read or written object data) +* BlueStore data (that is, recently read or recently written object data) -Cache memory usage is governed by the following options: -``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio``. -The fraction of the cache devoted to data -is governed by the effective bluestore cache size (depending on -``bluestore_cache_size[_ssd|_hdd]`` settings and the device class of the primary -device) as well as the meta and kv ratios. -The data fraction can be calculated by -`` * (1 - bluestore_cache_meta_ratio - bluestore_cache_kv_ratio)`` +Cache memory usage is governed by the configuration options +``bluestore_cache_meta_ratio`` and ``bluestore_cache_kv_ratio``. The fraction +of the cache that is reserved for data is governed by both the effective +BlueStore cache size (which depends on the relevant +``bluestore_cache_size[_ssd|_hdd]`` option and the device class of the primary +device) and the "meta" and "kv" ratios. This data fraction can be calculated +with the following formula: `` * (1 - +bluestore_cache_meta_ratio - bluestore_cache_kv_ratio)``. Checksums ========= -BlueStore checksums all metadata and data written to disk. Metadata -checksumming is handled by RocksDB and uses `crc32c`. Data -checksumming is done by BlueStore and can make use of `crc32c`, -`xxhash32`, or `xxhash64`. The default is `crc32c` and should be -suitable for most purposes. +BlueStore checksums all metadata and all data written to disk. Metadata +checksumming is handled by RocksDB and uses the `crc32c` algorithm. By +contrast, data checksumming is handled by BlueStore and can use either +`crc32c`, `xxhash32`, or `xxhash64`. Nonetheless, `crc32c` is the default +checksum algorithm and it is suitable for most purposes. -Full data checksumming does increase the amount of metadata that -BlueStore must store and manage. When possible, e.g., when clients -hint that data is written and read sequentially, BlueStore will -checksum larger blocks, but in many cases it must store a checksum -value (usually 4 bytes) for every 4 kilobyte block of data. +Full data checksumming increases the amount of metadata that BlueStore must +store and manage. 
Whenever possible (for example, when clients hint that data +is written and read sequentially), BlueStore will checksum larger blocks. In +many cases, however, it must store a checksum value (usually 4 bytes) for every +4 KB block of data. -It is possible to use a smaller checksum value by truncating the -checksum to two or one byte, reducing the metadata overhead. The -trade-off is that the probability that a random error will not be -detected is higher with a smaller checksum, going from about one in -four billion with a 32-bit (4 byte) checksum to one in 65,536 for a -16-bit (2 byte) checksum or one in 256 for an 8-bit (1 byte) checksum. -The smaller checksum values can be used by selecting `crc32c_16` or -`crc32c_8` as the checksum algorithm. +It is possible to obtain a smaller checksum value by truncating the checksum to +one or two bytes and reducing the metadata overhead. A drawback of this +approach is that it increases the probability of a random error going +undetected: about one in four billion given a 32-bit (4 byte) checksum, 1 in +65,536 given a 16-bit (2 byte) checksum, and 1 in 256 given an 8-bit (1 byte) +checksum. To use the smaller checksum values, select `crc32c_16` or `crc32c_8` +as the checksum algorithm. -The *checksum algorithm* can be set either via a per-pool -``csum_type`` property or the global config option. For example: +The *checksum algorithm* can be specified either via a per-pool ``csum_type`` +configuration option or via the global configuration option. For example: .. prompt:: bash $ ceph osd pool set csum_type + Inline Compression ================== -BlueStore supports inline compression using `snappy`, `zlib`, or -`lz4`. Please note that the `lz4` compression plugin is not -distributed in the official release. +BlueStore supports inline compression using `snappy`, `zlib`, `lz4`, or `zstd`. -Whether data in BlueStore is compressed is determined by a combination -of the *compression mode* and any hints associated with a write -operation. The modes are: +Whether data in BlueStore is compressed is determined by two factors: (1) the +*compression mode* and (2) any client hints associated with a write operation. +The compression modes are as follows: * **none**: Never compress data. * **passive**: Do not compress data unless the write operation has a *compressible* hint set. -* **aggressive**: Compress data unless the write operation has an +* **aggressive**: Do compress data unless the write operation has an *incompressible* hint set. * **force**: Try to compress data no matter what. -For more information about the *compressible* and *incompressible* IO -hints, see :c:func:`rados_set_alloc_hint`. +For more information about the *compressible* and *incompressible* I/O hints, +see :c:func:`rados_set_alloc_hint`. -Note that regardless of the mode, if the size of the data chunk is not -reduced sufficiently it will not be used and the original -(uncompressed) data will be stored. For example, if the ``bluestore -compression required ratio`` is set to ``.7`` then the compressed data -must be 70% of the size of the original (or smaller). +Note that data in Bluestore will be compressed only if the data chunk will be +sufficiently reduced in size (as determined by the ``bluestore compression +required ratio`` setting). No matter which compression modes have been used, if +the data chunk is too big, then it will be discarded and the original +(uncompressed) data will be stored instead. 
For example, if ``bluestore +compression required ratio`` is set to ``.7``, then data compression will take +place only if the size of the compressed data is no more than 70% of the size +of the original data. -The *compression mode*, *compression algorithm*, *compression required -ratio*, *min blob size*, and *max blob size* can be set either via a -per-pool property or a global config option. Pool properties can be -set with: +The *compression mode*, *compression algorithm*, *compression required ratio*, +*min blob size*, and *max blob size* settings can be specified either via a +per-pool property or via a global config option. To specify pool properties, +run the following commands: .. prompt:: bash $ @@ -291,192 +310,202 @@ set with: RocksDB Sharding ================ -Internally BlueStore uses multiple types of key-value data, -stored in RocksDB. Each data type in BlueStore is assigned a -unique prefix. Until Pacific all key-value data was stored in -single RocksDB column family: 'default'. Since Pacific, -BlueStore can divide this data into multiple RocksDB column -families. When keys have similar access frequency, modification -frequency and lifetime, BlueStore benefits from better caching -and more precise compaction. This improves performance, and also -requires less disk space during compaction, since each column -family is smaller and can compact independent of others. +BlueStore maintains several types of internal key-value data, all of which are +stored in RocksDB. Each data type in BlueStore is assigned a unique prefix. +Prior to the Pacific release, all key-value data was stored in a single RocksDB +column family: 'default'. In Pacific and later releases, however, BlueStore can +divide key-value data into several RocksDB column families. BlueStore achieves +better caching and more precise compaction when keys are similar: specifically, +when keys have similar access frequency, similar modification frequency, and a +similar lifetime. Under such conditions, performance is improved and less disk +space is required during compaction (because each column family is smaller and +is able to compact independently of the others). -OSDs deployed in Pacific or later use RocksDB sharding by default. -If Ceph is upgraded to Pacific from a previous version, sharding is off. +OSDs deployed in Pacific or later releases use RocksDB sharding by default. +However, if Ceph has been upgraded to Pacific or a later version from a +previous version, sharding is disabled on any OSDs that were created before +Pacific. -To enable sharding and apply the Pacific defaults, stop an OSD and run +To enable sharding and apply the Pacific defaults to a specific OSD, stop the +OSD and run the following command: .. prompt:: bash # - ceph-bluestore-tool \ + ceph-bluestore-tool \ --path <data path> \ - --sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \ + --sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \ reshard -Throttling +SPDK Usage ========== -SPDK Usage -================== - -If you want to use the SPDK driver for NVMe devices, you must prepare your system. -Refer to `SPDK document`__ for more details. +To use the SPDK driver for NVMe devices, you must first prepare your system. +See `SPDK document`__. .. __: http://www.spdk.io/doc/getting_started.html#getting_started_examples -SPDK offers a script to configure the device automatically. Users can run the -script as root: +SPDK offers a script that will configure the device automatically.
Run this +script with root permissions: .. prompt:: bash $ sudo src/spdk/scripts/setup.sh -You will need to specify the subject NVMe device's device selector with -the "spdk:" prefix for ``bluestore_block_path``. +You will need to specify the subject NVMe device's device selector with the +"spdk:" prefix for ``bluestore_block_path``. -For example, you can find the device selector of an Intel PCIe SSD with: +In the following example, you first find the device selector of an Intel NVMe +SSD by running the following command: .. prompt:: bash $ - lspci -mm -n -D -d 8086:0953 + lspci -mm -n -D -d 8086:0953 -The device selector always has the form of ``DDDD:BB:DD.FF`` or ``DDDD.BB.DD.FF``. +The form of the device selector is either ``DDDD:BB:DD.FF`` or +``DDDD.BB.DD.FF``. -and then set:: +Next, supposing that ``0000:01:00.0`` is the device selector found in the +output of the ``lspci`` command, you can specify the device selector by running +the following command:: - bluestore_block_path = "spdk:trtype:PCIe traddr:0000:01:00.0" + bluestore_block_path = "spdk:trtype:PCIe traddr:0000:01:00.0" -Where ``0000:01:00.0`` is the device selector found in the output of ``lspci`` -command above. - -You may also specify a remote NVMeoF target over the TCP transport as in the +You may also specify a remote NVMeoF target over the TCP transport, as in the following example:: - bluestore_block_path = "spdk:trtype:TCP traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1" + bluestore_block_path = "spdk:trtype:TCP traddr:10.67.110.197 trsvcid:4420 subnqn:nqn.2019-02.io.spdk:cnode1" -To run multiple SPDK instances per node, you must specify the -amount of dpdk memory in MB that each instance will use, to make sure each -instance uses its own DPDK memory. +To run multiple SPDK instances per node, you must make sure each instance uses +its own DPDK memory by specifying for each instance the amount of DPDK memory +(in MB) that the instance will use. -In most cases, a single device can be used for data, DB, and WAL. We describe +In most cases, a single device can be used for data, DB, and WAL. We describe this strategy as *colocating* these components. Be sure to enter the below -settings to ensure that all IOs are issued through SPDK.:: +settings to ensure that all I/Os are issued through SPDK:: bluestore_block_db_path = "" bluestore_block_db_size = 0 bluestore_block_wal_path = "" bluestore_block_wal_size = 0 -Otherwise, the current implementation will populate the SPDK map files with -kernel file system symbols and will use the kernel driver to issue DB/WAL IO. +If these settings are not entered, then the current implementation will +populate the SPDK map files with kernel file system symbols and will use the +kernel driver to issue DB/WAL I/Os. Minimum Allocation Size -======================== +======================= -There is a configured minimum amount of storage that BlueStore will allocate on -an OSD. In practice, this is the least amount of capacity that a RADOS object -can consume. The value of `bluestore_min_alloc_size` is derived from the -value of `bluestore_min_alloc_size_hdd` or `bluestore_min_alloc_size_ssd` -depending on the OSD's ``rotational`` attribute. This means that when an OSD -is created on an HDD, BlueStore will be initialized with the current value -of `bluestore_min_alloc_size_hdd`, and SSD OSDs (including NVMe devices) -with the value of `bluestore_min_alloc_size_ssd`. +There is a configured minimum amount of storage that BlueStore allocates on an +underlying storage device.
In practice, this is the least amount of capacity +that even a tiny RADOS object can consume on each OSD's primary device. The +configuration option in question-- ``bluestore_min_alloc_size`` --derives +its value from the value of either ``bluestore_min_alloc_size_hdd`` or +``bluestore_min_alloc_size_ssd``, depending on the OSD's ``rotational`` +attribute. Thus if an OSD is created on an HDD, BlueStore is initialized with +the current value of ``bluestore_min_alloc_size_hdd``; but with SSD OSDs +(including NVMe devices), Bluestore is initialized with the current value of +``bluestore_min_alloc_size_ssd``. -Through the Mimic release, the default values were 64KB and 16KB for rotational -(HDD) and non-rotational (SSD) media respectively. Octopus changed the default -for SSD (non-rotational) media to 4KB, and Pacific changed the default for HDD -(rotational) media to 4KB as well. +In Mimic and earlier releases, the default values were 64KB for rotational +media (HDD) and 16KB for non-rotational media (SSD). The Octopus release +changed the default value for non-rotational media (SSD) to 4KB, and the +Pacific release changed the default value for rotational media (HDD) to 4KB. -These changes were driven by space amplification experienced by Ceph RADOS -GateWay (RGW) deployments that host large numbers of small files +These changes were driven by space amplification that was experienced by Ceph +RADOS GateWay (RGW) deployments that hosted large numbers of small files (S3/Swift objects). -For example, when an RGW client stores a 1KB S3 object, it is written to a -single RADOS object. With the default `min_alloc_size` value, 4KB of -underlying drive space is allocated. This means that roughly -(4KB - 1KB) == 3KB is allocated but never used, which corresponds to 300% -overhead or 25% efficiency. Similarly, a 5KB user object will be stored -as one 4KB and one 1KB RADOS object, again stranding 4KB of device capcity, -though in this case the overhead is a much smaller percentage. Think of this -in terms of the remainder from a modulus operation. The overhead *percentage* -thus decreases rapidly as user object size increases. +For example, when an RGW client stores a 1 KB S3 object, that object is written +to a single RADOS object. In accordance with the default +``min_alloc_size`` value, 4 KB of underlying drive space is allocated. +This means that roughly 3 KB (that is, 4 KB minus 1 KB) is allocated but never +used: this corresponds to 300% overhead or 25% efficiency. Similarly, a 5 KB +user object will be stored as two RADOS objects, a 4 KB RADOS object and a 1 KB +RADOS object, with the result that 4KB of device capacity is stranded. In this +case, however, the overhead percentage is much smaller. Think of this in terms +of the remainder from a modulus operation. The overhead *percentage* thus +decreases rapidly as object size increases. -An easily missed additional subtlety is that this -takes place for *each* replica. So when using the default three copies of -data (3R), a 1KB S3 object actually consumes roughly 9KB of storage device -capacity. If erasure coding (EC) is used instead of replication, the -amplification may be even higher: for a ``k=4,m=2`` pool, our 1KB S3 object -will allocate (6 * 4KB) = 24KB of device capacity. +There is an additional subtlety that is easily missed: the amplification +phenomenon just described takes place for *each* replica.
For example, when +using the default of three copies of data (3R), a 1 KB S3 object actually +strands roughly 9 KB of storage device capacity. If erasure coding (EC) is used +instead of replication, the amplification might be even higher: for a ``k=4, +m=2`` pool, our 1 KB S3 object allocates 24 KB (that is, 4 KB multiplied by 6) +of device capacity. When an RGW bucket pool contains many relatively large user objects, the effect -of this phenomenon is often negligible, but should be considered for deployments -that expect a signficiant fraction of relatively small objects. +of this phenomenon is often negligible. However, with deployments that can +expect a significant fraction of relatively small user objects, the effect +should be taken into consideration. -The 4KB default value aligns well with conventional HDD and SSD devices. Some -new coarse-IU (Indirection Unit) QLC SSDs however perform and wear best -when `bluestore_min_alloc_size_ssd` -is set at OSD creation to match the device's IU:. 8KB, 16KB, or even 64KB. -These novel storage drives allow one to achieve read performance competitive -with conventional TLC SSDs and write performance faster than HDDs, with -high density and lower cost than TLC SSDs. +The 4KB default value aligns well with conventional HDD and SSD devices. +However, certain novel coarse-IU (Indirection Unit) QLC SSDs perform and wear +best when ``bluestore_min_alloc_size_ssd`` is specified at OSD creation +to match the device's IU: this might be 8KB, 16KB, or even 64KB. These novel +storage drives can achieve read performance that is competitive with that of +conventional TLC SSDs and write performance that is faster than that of HDDs, +with higher density and lower cost than TLC SSDs. -Note that when creating OSDs on these devices, one must carefully apply the -non-default value only to appropriate devices, and not to conventional SSD and -HDD devices. This may be done through careful ordering of OSD creation, custom -OSD device classes, and especially by the use of central configuration _masks_. +Note that when creating OSDs on these novel devices, one must be careful to +apply the non-default value only to appropriate devices, and not to +conventional HDD and SSD devices. Error can be avoided through careful ordering +of OSD creation, with custom OSD device classes, and especially by the use of +central configuration *masks*. -Quincy and later releases add -the `bluestore_use_optimal_io_size_for_min_alloc_size` -option that enables automatic discovery of the appropriate value as each OSD is -created. Note that the use of ``bcache``, ``OpenCAS``, ``dmcrypt``, -``ATA over Ethernet``, `iSCSI`, or other device layering / abstraction -technologies may confound the determination of appropriate values. OSDs -deployed on top of VMware storage have been reported to also -sometimes report a ``rotational`` attribute that does not match the underlying -hardware. +In Quincy and later releases, you can use the +``bluestore_use_optimal_io_size_for_min_alloc_size`` option to allow +automatic discovery of the correct value as each OSD is created. Note that the +use of ``bcache``, ``OpenCAS``, ``dmcrypt``, ``ATA over Ethernet``, `iSCSI`, or +other device-layering and abstraction technologies might confound the +determination of correct values. Moreover, OSDs deployed on top of VMware +storage have sometimes been found to report a ``rotational`` attribute that +does not match the underlying hardware. 
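A minimal sketch of the mask-based approach mentioned above, assuming a host named ``qlc-host-01`` that holds only coarse-IU QLC drives, an IU of 16KB, and an OSD ID of ``osd.42`` on that host (all three names and the value are illustrative assumptions, not taken from this patch):

.. prompt:: bash #

   # Scope the non-default IU to the assumed QLC-only host *before* its OSDs are
   # created, because bluestore_min_alloc_size takes effect only at OSD creation.
   ceph config set osd/host:qlc-host-01 bluestore_min_alloc_size_ssd 16384
   # After an OSD has been created on that host, confirm the value it picked up.
   ceph config get osd.42 bluestore_min_alloc_size_ssd

Because the ``host:`` mask limits the setting to daemons on that host, OSDs created elsewhere keep the 4KB default.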
-We suggest inspecting such OSDs at startup via logs and admin sockets to ensure that -behavior is appropriate. Note that this also may not work as desired with -older kernels. You can check for this by examining the presence and value -of ``/sys/block//queue/optimal_io_size``. +We suggest inspecting such OSDs at startup via logs and admin sockets in order +to ensure that their behavior is correct. Be aware that this kind of inspection +might not work as expected with older kernels. To check for this issue, +examine the presence and value of ``/sys/block//queue/optimal_io_size``. -You may also inspect a given OSD: +.. note:: When running Reef or a later Ceph release, the ``min_alloc_size`` + baked into each OSD is conveniently reported by ``ceph osd metadata``. + +To inspect a specific OSD, run the following command: .. prompt:: bash # - ceph osd metadata osd.1701 | grep rotational + ceph osd metadata osd.1701 | egrep rotational\|alloc -This space amplification may manifest as an unusually high ratio of raw to -stored data reported by ``ceph df``. ``ceph osd df`` may also report -anomalously high ``%USE`` / ``VAR`` values when -compared to other, ostensibly identical OSDs. A pool using OSDs with -mismatched ``min_alloc_size`` values may experience unexpected balancer -behavior as well. +This space amplification might manifest as an unusually high ratio of raw to +stored data as reported by ``ceph df``. There might also be ``%USE`` / ``VAR`` +values reported by ``ceph osd df`` that are unusually high in comparison to +other, ostensibly identical, OSDs. Finally, there might be unexpected balancer +behavior in pools that use OSDs that have mismatched ``min_alloc_size`` values. -Note that this BlueStore attribute takes effect *only* at OSD creation; if -changed later, a given OSD's behavior will not change unless / until it is -destroyed and redeployed with the appropriate option value(s). Upgrading -to a later Ceph release will *not* change the value used by OSDs deployed -under older releases or with other settings. +This BlueStore attribute takes effect *only* at OSD creation; if the attribute +is changed later, a specific OSD's behavior will not change unless and until +the OSD is destroyed and redeployed with the appropriate option value(s). +Upgrading to a later Ceph release will *not* change the value used by OSDs that +were deployed under older releases or with other settings. -DSA (Data Streaming Accelerator Usage) +DSA (Data Streaming Accelerator) Usage ====================================== -If you want to use the DML library to drive DSA device for offloading -read/write operations on Persist memory in Bluestore. You need to install -`DML`_ and `idxd-config`_ library in your machine with SPR (Sapphire Rapids) CPU. +If you want to use the DML library to drive the DSA device for offloading +read/write operations on persistent memory (PMEM) in BlueStore, you need to +install `DML`_ and the `idxd-config`_ library. This will work only on machines +that have a SPR (Sapphire Rapids) CPU. -.. _DML: https://github.com/intel/DML +.. _dml: https://github.com/intel/dml .. _idxd-config: https://github.com/intel/idxd-config -After installing the DML software, you need to configure the shared -work queues (WQs) with the following WQ configuration example via accel-config tool: +After installing the DML software, configure the shared work queues (WQs) with +reference to the following WQ configuration example: .. 
prompt:: bash $ - accel-config config-wq --group-id=1 --mode=shared --wq-size=16 --threshold=15 --type=user --name="MyApp1" --priority=10 --block-on-fault=1 dsa0/wq0.1 + accel-config config-wq --group-id=1 --mode=shared --wq-size=16 --threshold=15 --type=user --name="MyApp1" --priority=10 --block-on-fault=1 dsa0/wq0.1 accel-config config-engine dsa0/engine0.1 --group-id=1 accel-config enable-device dsa0 accel-config enable-wq dsa0/wq0.1 diff --git a/ceph/doc/rados/configuration/common.rst b/ceph/doc/rados/configuration/common.rst index 118535b67..174b35579 100644 --- a/ceph/doc/rados/configuration/common.rst +++ b/ceph/doc/rados/configuration/common.rst @@ -218,4 +218,4 @@ If you need to allow multiple clusters to exist on the same host, use .. _Hardware Recommendations: ../../../start/hardware-recommendations .. _Network Configuration Reference: ../network-config-ref .. _OSD Config Reference: ../osd-config-ref -.. _Configuring Monitor/OSD Interaction: ../mon-osd-interactio +.. _Configuring Monitor/OSD Interaction: ../mon-osd-interaction diff --git a/ceph/doc/rados/configuration/filestore-config-ref.rst b/ceph/doc/rados/configuration/filestore-config-ref.rst index 435a800a8..e06b5dc76 100644 --- a/ceph/doc/rados/configuration/filestore-config-ref.rst +++ b/ceph/doc/rados/configuration/filestore-config-ref.rst @@ -2,8 +2,14 @@ Filestore Config Reference ============================ -The Filestore back end is no longer the default when creating new OSDs, -though Filestore OSDs are still supported. +.. note:: Since the Luminous release of Ceph, Filestore has not been Ceph's + default storage back end. Since the Luminous release of Ceph, BlueStore has + been Ceph's default storage back end. However, Filestore OSDs are still + supported. See :ref:`OSD Back Ends + <rados_config_storage_devices_osd_backends>`. See :ref:`BlueStore Migration + <rados_operations_bluestore_migration>` for instructions explaining how to + replace an existing Filestore back end with a BlueStore back end. + ``filestore debug omap check`` @@ -18,26 +24,31 @@ though Filestore OSDs are still supported. Extended Attributes =================== -Extended Attributes (XATTRs) are important for Filestore OSDs. -Some file systems have limits on the number of bytes that can be stored in XATTRs. -Additionally, in some cases, the file system may not be as fast as an alternative -method of storing XATTRs. The following settings may help improve performance -by using a method of storing XATTRs that is extrinsic to the underlying file system. +Extended Attributes (XATTRs) are important for Filestore OSDs. However, certain +disadvantages can occur when the underlying file system is used for the storage +of XATTRs: some file systems have limits on the number of bytes that can be +stored in XATTRs, and your file system might in some cases therefore run slower +than would an alternative method of storing XATTRs. For this reason, a method +of storing XATTRs extrinsic to the underlying file system might improve +performance. To implement such an extrinsic method, refer to the following +settings. -Ceph XATTRs are stored as ``inline xattr``, using the XATTRs provided -by the underlying file system, if it does not impose a size limit. If -there is a size limit (4KB total on ext4, for instance), some Ceph -XATTRs will be stored in a key/value database when either the +If the underlying file system has no size limit, then Ceph XATTRs are stored as +``inline xattr``, using the XATTRs provided by the file system.
But if there is +a size limit (for example, ext4 imposes a limit of 4 KB total), then some Ceph +XATTRs will be stored in a key/value database when the limit is reached. More +precisely, this begins to occur when either the ``filestore_max_inline_xattr_size`` or ``filestore_max_inline_xattrs`` threshold is reached. ``filestore_max_inline_xattr_size`` -:Description: The maximum size of an XATTR stored in the file system (i.e., XFS, - Btrfs, EXT4, etc.) per object. Should not be larger than the - file system can handle. Default value of 0 means to use the value - specific to the underlying file system. +:Description: Defines the maximum size per object of an XATTR that can be + stored in the file system (for example, XFS, Btrfs, ext4). The + specified size should not be larger than the file system can + handle. Using the default value of 0 instructs Filestore to use + the value specific to the file system. :Type: Unsigned 32-bit Integer :Required: No :Default: ``0`` @@ -45,8 +56,9 @@ threshold is reached. ``filestore_max_inline_xattr_size_xfs`` -:Description: The maximum size of an XATTR stored in the XFS file system. - Only used if ``filestore_max_inline_xattr_size`` == 0. +:Description: Defines the maximum size of an XATTR that can be stored in the + XFS file system. This setting is used only if + ``filestore_max_inline_xattr_size`` == 0. :Type: Unsigned 32-bit Integer :Required: No :Default: ``65536`` @@ -54,8 +66,9 @@ threshold is reached. ``filestore_max_inline_xattr_size_btrfs`` -:Description: The maximum size of an XATTR stored in the Btrfs file system. - Only used if ``filestore_max_inline_xattr_size`` == 0. +:Description: Defines the maximum size of an XATTR that can be stored in the + Btrfs file system. This setting is used only if + ``filestore_max_inline_xattr_size`` == 0. :Type: Unsigned 32-bit Integer :Required: No :Default: ``2048`` @@ -63,8 +76,8 @@ threshold is reached. ``filestore_max_inline_xattr_size_other`` -:Description: The maximum size of an XATTR stored in other file systems. - Only used if ``filestore_max_inline_xattr_size`` == 0. +:Description: Defines the maximum size of an XATTR that can be stored in other file systems. + This setting is used only if ``filestore_max_inline_xattr_size`` == 0. :Type: Unsigned 32-bit Integer :Required: No :Default: ``512`` @@ -72,9 +85,8 @@ threshold is reached. ``filestore_max_inline_xattrs`` -:Description: The maximum number of XATTRs stored in the file system per object. - Default value of 0 means to use the value specific to the - underlying file system. +:Description: Defines the maximum number of XATTRs per object that can be stored in the file system. + Using the default value of 0 instructs Filestore to use the value specific to the file system. :Type: 32-bit Integer :Required: No :Default: ``0`` @@ -82,8 +94,8 @@ threshold is reached. ``filestore_max_inline_xattrs_xfs`` -:Description: The maximum number of XATTRs stored in the XFS file system per object. - Only used if ``filestore_max_inline_xattrs`` == 0. +:Description: Defines the maximum number of XATTRs per object that can be stored in the XFS file system. + This setting is used only if ``filestore_max_inline_xattrs`` == 0. :Type: 32-bit Integer :Required: No :Default: ``10`` @@ -91,8 +103,8 @@ threshold is reached. ``filestore_max_inline_xattrs_btrfs`` -:Description: The maximum number of XATTRs stored in the Btrfs file system per object. - Only used if ``filestore_max_inline_xattrs`` == 0. 
+:Description: Defines the maximum number of XATTRs per object that can be stored in the Btrfs file system. + This setting is used only if ``filestore_max_inline_xattrs`` == 0. :Type: 32-bit Integer :Required: No :Default: ``10`` @@ -100,8 +112,8 @@ threshold is reached. ``filestore_max_inline_xattrs_other`` -:Description: The maximum number of XATTRs stored in other file systems per object. - Only used if ``filestore_max_inline_xattrs`` == 0. +:Description: Defines the maximum number of XATTRs per object that can be stored in other file systems. + This setting is used only if ``filestore_max_inline_xattrs`` == 0. :Type: 32-bit Integer :Required: No :Default: ``2`` @@ -111,18 +123,19 @@ threshold is reached. Synchronization Intervals ========================= -Filestore needs to periodically quiesce writes and synchronize the -file system, which creates a consistent commit point. It can then free journal -entries up to the commit point. Synchronizing more frequently tends to reduce -the time required to perform synchronization, and reduces the amount of data -that needs to remain in the journal. Less frequent synchronization allows the -backing file system to coalesce small writes and metadata updates more -optimally, potentially resulting in more efficient synchronization at the -expense of potentially increasing tail latency. +Filestore must periodically quiesce writes and synchronize the file system. +Each synchronization creates a consistent commit point. When the commit point +is created, Filestore is able to free all journal entries up to that point. +More-frequent synchronization tends to reduce both synchronization time and +the amount of data that needs to remain in the journal. Less-frequent +synchronization allows the backing file system to coalesce small writes and +metadata updates, potentially increasing synchronization +efficiency but also potentially increasing tail latency. + ``filestore_max_sync_interval`` -:Description: The maximum interval in seconds for synchronizing Filestore. +:Description: Defines the maximum interval (in seconds) for synchronizing Filestore. :Type: Double :Required: No :Default: ``5`` @@ -130,7 +143,7 @@ expense of potentially increasing tail latency. ``filestore_min_sync_interval`` -:Description: The minimum interval in seconds for synchronizing Filestore. +:Description: Defines the minimum interval (in seconds) for synchronizing Filestore. :Type: Double :Required: No :Default: ``.01`` @@ -142,14 +155,14 @@ Flusher ======= The Filestore flusher forces data from large writes to be written out using -``sync_file_range`` before the sync in order to (hopefully) reduce the cost of -the eventual sync. In practice, disabling 'filestore_flusher' seems to improve -performance in some cases. +``sync_file_range`` prior to the synchronization. +Ideally, this action reduces the cost of the eventual synchronization. In practice, however, disabling +'filestore_flusher' seems in some cases to improve performance. ``filestore_flusher`` -:Description: Enables the filestore flusher. +:Description: Enables the Filestore flusher. :Type: Boolean :Required: No :Default: ``false`` @@ -158,7 +171,7 @@ performance in some cases. ``filestore_flusher_max_fds`` -:Description: Sets the maximum number of file descriptors for the flusher. +:Description: Defines the maximum number of file descriptors for the flusher. :Type: Integer :Required: No :Default: ``512`` @@ -176,7 +189,7 @@ performance in some cases. 
``filestore_fsync_flushes_journal_data`` -:Description: Flush journal data during file system synchronization. +:Description: Flushes journal data during file-system synchronization. :Type: Boolean :Required: No :Default: ``false`` @@ -187,11 +200,11 @@ performance in some cases. Queue ===== -The following settings provide limits on the size of the Filestore queue. +The following settings define limits on the size of the Filestore queue: ``filestore_queue_max_ops`` -:Description: Defines the maximum number of in progress operations the file store accepts before blocking on queuing new operations. +:Description: Defines the maximum number of in-progress operations that Filestore accepts before it blocks the queueing of any new operations. :Type: Integer :Required: No. Minimal impact on performance. :Default: ``50`` @@ -199,23 +212,20 @@ The following settings provide limits on the size of the Filestore queue. ``filestore_queue_max_bytes`` -:Description: The maximum number of bytes for an operation. +:Description: Defines the maximum number of bytes permitted per operation. :Type: Integer :Required: No :Default: ``100 << 20`` - - .. index:: filestore; timeouts Timeouts ======== - ``filestore_op_threads`` -:Description: The number of file system operation threads that execute in parallel. +:Description: Defines the number of file-system operation threads that execute in parallel. :Type: Integer :Required: No :Default: ``2`` @@ -223,7 +233,7 @@ Timeouts ``filestore_op_thread_timeout`` -:Description: The timeout for a file system operation thread (in seconds). +:Description: Defines the timeout (in seconds) for a file-system operation thread. :Type: Integer :Required: No :Default: ``60`` @@ -231,7 +241,7 @@ Timeouts ``filestore_op_thread_suicide_timeout`` -:Description: The timeout for a commit operation before cancelling the commit (in seconds). +:Description: Defines the timeout (in seconds) for a commit operation before the commit is cancelled. :Type: Integer :Required: No :Default: ``180`` @@ -245,17 +255,17 @@ B-Tree Filesystem ``filestore_btrfs_snap`` -:Description: Enable snapshots for a ``btrfs`` filestore. +:Description: Enables snapshots for a ``btrfs`` Filestore. :Type: Boolean -:Required: No. Only used for ``btrfs``. +:Required: No. Used only for ``btrfs``. :Default: ``true`` ``filestore_btrfs_clone_range`` -:Description: Enable cloning ranges for a ``btrfs`` filestore. +:Description: Enables cloning ranges for a ``btrfs`` Filestore. :Type: Boolean -:Required: No. Only used for ``btrfs``. +:Required: No. Used only for ``btrfs``. :Default: ``true`` @@ -267,7 +277,7 @@ Journal ``filestore_journal_parallel`` -:Description: Enables parallel journaling, default for Btrfs. +:Description: Enables parallel journaling, default for ``btrfs``. :Type: Boolean :Required: No :Default: ``false`` @@ -275,7 +285,7 @@ Journal ``filestore_journal_writeahead`` -:Description: Enables writeahead journaling, default for XFS. +:Description: Enables write-ahead journaling, default for XFS. :Type: Boolean :Required: No :Default: ``false`` @@ -283,7 +293,7 @@ Journal ``filestore_journal_trailing`` -:Description: Deprecated, never use. +:Description: Deprecated. 
**Never use.** :Type: Boolean :Required: No :Default: ``false`` @@ -295,8 +305,8 @@ Misc ``filestore_merge_threshold`` -:Description: Min number of files in a subdir before merging into parent - NOTE: A negative value means to disable subdir merging +:Description: Defines the minimum number of files permitted in a subdirectory before the subdirectory is merged into its parent directory. + NOTE: A negative value means that subdirectory merging is disabled. :Type: Integer :Required: No :Default: ``-10`` @@ -305,8 +315,8 @@ Misc ``filestore_split_multiple`` :Description: ``(filestore_split_multiple * abs(filestore_merge_threshold) + (rand() % filestore_split_rand_factor)) * 16`` - is the maximum number of files in a subdirectory before - splitting into child directories. + is the maximum number of files permitted in a subdirectory + before the subdirectory is split into child directories. :Type: Integer :Required: No @@ -316,10 +326,10 @@ Misc ``filestore_split_rand_factor`` :Description: A random factor added to the split threshold to avoid - too many (expensive) Filestore splits occurring at once. See - ``filestore_split_multiple`` for details. - This can only be changed offline for an existing OSD, - via the ``ceph-objectstore-tool apply-layout-settings`` command. + too many (expensive) Filestore splits occurring at the same time. + For details, see ``filestore_split_multiple``. + To change this setting for an existing OSD, it is necessary to take the OSD + offline before running the ``ceph-objectstore-tool apply-layout-settings`` command. :Type: Unsigned 32-bit Integer :Required: No @@ -328,7 +338,7 @@ Misc ``filestore_update_to`` -:Description: Limits Filestore auto upgrade to specified version. +:Description: Limits automatic upgrades to a specified version of Filestore. Useful in cases in which you want to avoid upgrading to a specific version. :Type: Integer :Required: No :Default: ``1000`` @@ -336,7 +346,7 @@ Misc ``filestore_blackhole`` -:Description: Drop any new transactions on the floor. +:Description: Drops any new transactions on the floor, similar to redirecting to NULL. :Type: Boolean :Required: No :Default: ``false`` @@ -344,7 +354,7 @@ Misc ``filestore_dump_file`` -:Description: File onto which store transaction dumps. +:Description: Defines the file that transaction dumps are stored on. :Type: Boolean :Required: No :Default: ``false`` @@ -352,7 +362,7 @@ Misc ``filestore_kill_at`` -:Description: inject a failure at the n'th opportunity +:Description: Injects a failure at the *n*\th opportunity. :Type: String :Required: No :Default: ``false`` @@ -360,8 +370,7 @@ Misc ``filestore_fail_eio`` -:Description: Fail/Crash on eio. +:Description: Fail/Crash on EIO. :Type: Boolean :Required: No :Default: ``true`` - diff --git a/ceph/doc/rados/configuration/mon-config-ref.rst b/ceph/doc/rados/configuration/mon-config-ref.rst index 3b12af43d..cafcf2c38 100644 --- a/ceph/doc/rados/configuration/mon-config-ref.rst +++ b/ceph/doc/rados/configuration/mon-config-ref.rst @@ -16,24 +16,29 @@ consistent, but you can add, remove or replace a monitor in a cluster. See Background ========== -Ceph Monitors maintain a "master copy" of the :term:`Cluster Map`, which means a -:term:`Ceph Client` can determine the location of all Ceph Monitors, Ceph OSD -Daemons, and Ceph Metadata Servers just by connecting to one Ceph Monitor and -retrieving a current cluster map. Before Ceph Clients can read from or write to -Ceph OSD Daemons or Ceph Metadata Servers, they must connect to a Ceph Monitor -first. 
With a current copy of the cluster map and the CRUSH algorithm, a Ceph -Client can compute the location for any object. The ability to compute object -locations allows a Ceph Client to talk directly to Ceph OSD Daemons, which is a -very important aspect of Ceph's high scalability and performance. See -`Scalability and High Availability`_ for additional details. +Ceph Monitors maintain a "master copy" of the :term:`Cluster Map`. -The primary role of the Ceph Monitor is to maintain a master copy of the cluster -map. Ceph Monitors also provide authentication and logging services. Ceph -Monitors write all changes in the monitor services to a single Paxos instance, -and Paxos writes the changes to a key/value store for strong consistency. Ceph -Monitors can query the most recent version of the cluster map during sync -operations. Ceph Monitors leverage the key/value store's snapshots and iterators -(using leveldb) to perform store-wide synchronization. +The maintenance by Ceph Monitors of a :term:`Cluster Map` makes it possible for +a :term:`Ceph Client` to determine the location of all Ceph Monitors, Ceph OSD +Daemons, and Ceph Metadata Servers by connecting to one Ceph Monitor and +retrieving a current cluster map. Before Ceph Clients can read from or write to +Ceph OSD Daemons or Ceph Metadata Servers, they must connect to a Ceph Monitor. +When a Ceph client has a current copy of the cluster map and the CRUSH +algorithm, it can compute the location for any RADOS object within the +cluster. This ability to compute the locations of objects makes it possible for +Ceph Clients to talk directly to Ceph OSD Daemons. This direct communication +with Ceph OSD Daemons represents an improvement upon traditional storage +architectures in which clients were required to communicate with a central +component, and that improvement contributes to Ceph's high scalability and +performance. See `Scalability and High Availability`_ for additional details. + +The Ceph Monitor's primary function is to maintain a master copy of the cluster +map. Monitors also provide authentication and logging services. All changes in +the monitor services are written by the Ceph Monitor to a single Paxos +instance, and Paxos writes the changes to a key/value store for strong +consistency. Ceph Monitors are able to query the most recent version of the +cluster map during sync operations, and they use the key/value store's +snapshots and iterators (using leveldb) to perform store-wide synchronization. .. ditaa:: /-------------\ /-------------\ @@ -56,12 +61,6 @@ operations. Ceph Monitors leverage the key/value store's snapshots and iterators | cCCC |*---------------------+ \-------------/ - -.. deprecated:: version 0.58 - -In Ceph versions 0.58 and earlier, Ceph Monitors use a Paxos instance for -each service and store the map as a file. - .. index:: Ceph Monitor; cluster map Cluster Maps diff --git a/ceph/doc/rados/configuration/storage-devices.rst b/ceph/doc/rados/configuration/storage-devices.rst index 8536d2cfa..f576d5683 100644 --- a/ceph/doc/rados/configuration/storage-devices.rst +++ b/ceph/doc/rados/configuration/storage-devices.rst @@ -25,6 +25,7 @@ There are two Ceph daemons that store data on devices: additional monitoring and providing interfaces to external monitoring and management systems. +..
_rados_config_storage_devices_osd_backends: OSD Back Ends ============= diff --git a/ceph/doc/rados/operations/balancer.rst b/ceph/doc/rados/operations/balancer.rst index b02a8914d..e93b7e47b 100644 --- a/ceph/doc/rados/operations/balancer.rst +++ b/ceph/doc/rados/operations/balancer.rst @@ -3,14 +3,15 @@ Balancer ======== -The *balancer* can optimize the placement of PGs across OSDs in -order to achieve a balanced distribution, either automatically or in a -supervised fashion. +The *balancer* can optimize the allocation of placement groups (PGs) across +OSDs in order to achieve a balanced distribution. The balancer can operate +either automatically or in a supervised fashion. + Status ------ -The current status of the balancer can be checked at any time with: +To check the current status of the balancer, run the following command: .. prompt:: bash $ @@ -20,70 +21,78 @@ The current status of the balancer can be checked at any time with: Automatic balancing ------------------- -The automatic balancing feature is enabled by default in ``upmap`` -mode. Please refer to :ref:`upmap` for more details. The balancer can be -turned off with: +When the balancer is in ``upmap`` mode, the automatic balancing feature is +enabled by default. For more details, see :ref:`upmap`. To disable the +balancer, run the following command: .. prompt:: bash $ ceph balancer off -The balancer mode can be changed to ``crush-compat`` mode, which is -backward compatible with older clients, and will make small changes to -the data distribution over time to ensure that OSDs are equally utilized. +The balancer mode can be changed from ``upmap`` mode to ``crush-compat`` mode. +``crush-compat`` mode is backward compatible with older clients. In +``crush-compat`` mode, the balancer automatically makes small changes to the +data distribution in order to ensure that OSDs are utilized equally. Throttling ---------- -No adjustments will be made to the PG distribution if the cluster is -degraded (e.g., because an OSD has failed and the system has not yet -healed itself). +If the cluster is degraded (that is, if an OSD has failed and the system hasn't +healed itself yet), then the balancer will not make any adjustments to the PG +distribution. -When the cluster is healthy, the balancer will throttle its changes -such that the percentage of PGs that are misplaced (i.e., that need to -be moved) is below a threshold of (by default) 5%. The -``target_max_misplaced_ratio`` threshold can be adjusted with: +When the cluster is healthy, the balancer will incrementally move a small +fraction of unbalanced PGs in order to improve distribution. This fraction +will not exceed a certain threshold that defaults to 5%. To adjust this +``target_max_misplaced_ratio`` threshold setting, run the following command: .. prompt:: bash $ ceph config set mgr target_max_misplaced_ratio .07 # 7% -Set the number of seconds to sleep in between runs of the automatic balancer: +The balancer sleeps between runs. To set the number of seconds for this +interval of sleep, run the following command: .. prompt:: bash $ ceph config set mgr mgr/balancer/sleep_interval 60 -Set the time of day to begin automatic balancing in HHMM format: +To set the time of day (in HHMM format) at which automatic balancing begins, +run the following command: .. 
prompt:: bash $ ceph config set mgr mgr/balancer/begin_time 0000 -Set the time of day to finish automatic balancing in HHMM format: +To set the time of day (in HHMM format) at which automatic balancing ends, run +the following command: .. prompt:: bash $ ceph config set mgr mgr/balancer/end_time 2359 -Restrict automatic balancing to this day of the week or later. -Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on: +Automatic balancing can be restricted to certain days of the week. To restrict +it to a specific day of the week or later (as with crontab, ``0`` is Sunday, +``1`` is Monday, and so on), run the following command: .. prompt:: bash $ ceph config set mgr mgr/balancer/begin_weekday 0 -Restrict automatic balancing to this day of the week or earlier. -Uses the same conventions as crontab, 0 is Sunday, 1 is Monday, and so on: +To restrict automatic balancing to a specific day of the week or earlier +(again, ``0`` is Sunday, ``1`` is Monday, and so on), run the following +command: .. prompt:: bash $ ceph config set mgr mgr/balancer/end_weekday 6 -Pool IDs to which the automatic balancing will be limited. -The default for this is an empty string, meaning all pools will be balanced. -The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` command: +Automatic balancing can be restricted to certain pools. By default, the value +of this setting is an empty string, so that all pools are automatically +balanced. To restrict automatic balancing to specific pools, retrieve their +numeric pool IDs (by running the :command:`ceph osd pool ls detail` command), +and then run the following command: .. prompt:: bash $ @@ -93,43 +102,41 @@ The numeric pool IDs can be gotten with the :command:`ceph osd pool ls detail` c Modes ----- -There are currently two supported balancer modes: +There are two supported balancer modes: -#. **crush-compat**. The CRUSH compat mode uses the compat weight-set - feature (introduced in Luminous) to manage an alternative set of - weights for devices in the CRUSH hierarchy. The normal weights - should remain set to the size of the device to reflect the target - amount of data that we want to store on the device. The balancer - then optimizes the weight-set values, adjusting them up or down in - small increments, in order to achieve a distribution that matches - the target distribution as closely as possible. (Because PG - placement is a pseudorandom process, there is a natural amount of - variation in the placement; by optimizing the weights we - counter-act that natural variation.) +#. **crush-compat**. This mode uses the compat weight-set feature (introduced + in Luminous) to manage an alternative set of weights for devices in the + CRUSH hierarchy. When the balancer is operating in this mode, the normal + weights should remain set to the size of the device in order to reflect the + target amount of data intended to be stored on the device. The balancer will + then optimize the weight-set values, adjusting them up or down in small + increments, in order to achieve a distribution that matches the target + distribution as closely as possible. (Because PG placement is a pseudorandom + process, it is subject to a natural amount of variation; optimizing the + weights serves to counteract that natural variation.) - Notably, this mode is *fully backwards compatible* with older - clients: when an OSDMap and CRUSH map is shared with older clients, - we present the optimized weights as the "real" weights. 
+ Note that this mode is *fully backward compatible* with older clients: when + an OSD Map and CRUSH map are shared with older clients, Ceph presents the + optimized weights as the "real" weights. - The primary restriction of this mode is that the balancer cannot - handle multiple CRUSH hierarchies with different placement rules if - the subtrees of the hierarchy share any OSDs. (This is normally - not the case, and is generally not a recommended configuration - because it is hard to manage the space utilization on the shared - OSDs.) + The primary limitation of this mode is that the balancer cannot handle + multiple CRUSH hierarchies with different placement rules if the subtrees of + the hierarchy share any OSDs. (Such sharing of OSDs is not typical and, + because of the difficulty of managing the space utilization on the shared + OSDs, is generally not recommended.) -#. **upmap**. Starting with Luminous, the OSDMap can store explicit - mappings for individual OSDs as exceptions to the normal CRUSH - placement calculation. These `upmap` entries provide fine-grained - control over the PG mapping. This CRUSH mode will optimize the - placement of individual PGs in order to achieve a balanced - distribution. In most cases, this distribution is "perfect," which - an equal number of PGs on each OSD (+/-1 PG, since they might not - divide evenly). +#. **upmap**. In Luminous and later releases, the OSDMap can store explicit + mappings for individual OSDs as exceptions to the normal CRUSH placement + calculation. These ``upmap`` entries provide fine-grained control over the + PG mapping. This balancer mode optimizes the placement of individual PGs in + order to achieve a balanced distribution. In most cases, the resulting + distribution is nearly perfect: that is, there is an equal number of PGs on + each OSD (±1 PG, since the total number might not divide evenly). - Note that using upmap requires that all clients be Luminous or newer. + To use ``upmap``, all clients must be Luminous or newer. -The default mode is ``upmap``. The mode can be adjusted with: +The default mode is ``upmap``. The mode can be changed to ``crush-compat`` by +running the following command: .. prompt:: bash $ @@ -138,69 +145,77 @@ The default mode is ``upmap``. The mode can be adjusted with: Supervised optimization ----------------------- -The balancer operation is broken into a few distinct phases: +Supervised use of the balancer can be understood in terms of three distinct +phases: -#. building a *plan* -#. evaluating the quality of the data distribution, either for the current PG distribution, or the PG distribution that would result after executing a *plan* -#. executing the *plan* +#. building a plan +#. evaluating the quality of the data distribution, either for the current PG + distribution or for the PG distribution that would result after executing a + plan +#. executing the plan -To evaluate and score the current distribution: +To evaluate the current distribution, run the following command: .. prompt:: bash $ ceph balancer eval -You can also evaluate the distribution for a single pool with: +To evaluate the distribution for a single pool, run the following command: .. prompt:: bash $ ceph balancer eval -Greater detail for the evaluation can be seen with: +To see the evaluation in greater detail, run the following command: .. prompt:: bash $ ceph balancer eval-verbose ...
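The plan-related commands documented below can be strung together into a single supervised pass; the sketch that follows assumes an arbitrary plan name, ``plan-a``, which is only an example (any identifying string would do):

.. prompt:: bash $

   ceph balancer eval              # score the current distribution
   ceph balancer optimize plan-a   # build a plan using the currently configured mode
   ceph balancer show plan-a       # inspect the proposed changes
   ceph balancer eval plan-a       # a score lower than the current one indicates an improvement
   ceph balancer execute plan-a    # apply the plan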
- -The balancer can generate a plan, using the currently configured mode, with: + +To instruct the balancer to generate a plan (using the currently configured +mode), make up a name (any useful identifying string) for the plan, and run the +following command: .. prompt:: bash $ ceph balancer optimize <plan-name> -The name is provided by the user and can be any useful identifying string. The contents of a plan can be seen with: +To see the contents of a plan, run the following command: .. prompt:: bash $ ceph balancer show <plan-name> -All plans can be shown with: +To display all plans, run the following command: .. prompt:: bash $ ceph balancer ls -Old plans can be discarded with: +To discard an old plan, run the following command: .. prompt:: bash $ ceph balancer rm <plan-name> -Currently recorded plans are shown as part of the status command: +To see currently recorded plans, examine the output of the following status +command: .. prompt:: bash $ ceph balancer status -The quality of the distribution that would result after executing a plan can be calculated with: +To evaluate the distribution that would result from executing a specific plan, +run the following command: .. prompt:: bash $ ceph balancer eval <plan-name> -Assuming the plan is expected to improve the distribution (i.e., it has a lower score than the current cluster state), the user can execute that plan with: +If a plan is expected to improve the distribution (that is, the plan's score is +lower than the current cluster state's score), you can execute that plan by +running the following command: .. prompt:: bash $ ceph balancer execute <plan-name> - diff --git a/ceph/doc/rados/operations/bluestore-migration.rst b/ceph/doc/rados/operations/bluestore-migration.rst index 7cee07156..6324f775b 100644 --- a/ceph/doc/rados/operations/bluestore-migration.rst +++ b/ceph/doc/rados/operations/bluestore-migration.rst @@ -1,3 +1,5 @@ +.. _rados_operations_bluestore_migration: + ===================== BlueStore Migration ===================== diff --git a/ceph/doc/rados/operations/cache-tiering.rst b/ceph/doc/rados/operations/cache-tiering.rst index 8056ace47..cd29b87d0 100644 --- a/ceph/doc/rados/operations/cache-tiering.rst +++ b/ceph/doc/rados/operations/cache-tiering.rst @@ -1,6 +1,10 @@ =============== Cache Tiering =============== +.. warning:: Cache tiering has been deprecated in the Reef release as it + has lacked a maintainer for a very long time. This does not mean + it will be certainly removed, but we may choose to remove it + without much further notice. A cache tier provides Ceph Clients with better I/O performance for a subset of the data stored in a backing storage tier. Cache tiering involves creating a diff --git a/ceph/doc/rados/operations/crush-map.rst b/ceph/doc/rados/operations/crush-map.rst index f22ebb24e..79a27adf2 100644 --- a/ceph/doc/rados/operations/crush-map.rst +++ b/ceph/doc/rados/operations/crush-map.rst @@ -315,7 +315,7 @@ the hierarchy is visible as a separate column (labeled either .. prompt:: bash $ - ceph osd tree + ceph osd crush tree When both *compat* and *per-pool* weight sets are in use, data placement for a particular pool will use its own per-pool weight set diff --git a/ceph/doc/rados/operations/data-placement.rst b/ceph/doc/rados/operations/data-placement.rst index bd9bd7ec7..5e9b6d25d 100644 --- a/ceph/doc/rados/operations/data-placement.rst +++ b/ceph/doc/rados/operations/data-placement.rst @@ -2,40 +2,44 @@ Data Placement Overview ========================= -Ceph stores, replicates and rebalances data objects across a RADOS cluster -dynamically.
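Taken together, the balancer plan commands shown above form a short supervised
loop. The following is a minimal sketch of one such round, in which ``myplan``
is an arbitrary plan name chosen by the operator::

    # build a plan using the currently configured mode
    ceph balancer optimize myplan

    # inspect the proposed changes and the score they would produce
    ceph balancer show myplan
    ceph balancer eval myplan

    # execute the plan only if its score is lower (better) than the score
    # reported by "ceph balancer eval" for the current state, then remove it
    ceph balancer execute myplan
    ceph balancer rm myplan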
With many different users storing objects in different pools for -different purposes on countless OSDs, Ceph operations require some data -placement planning. The main data placement planning concepts in Ceph include: +Ceph stores, replicates, and rebalances data objects across a RADOS cluster +dynamically. Because different users store objects in different pools for +different purposes on many OSDs, Ceph operations require a certain amount of +data- placement planning. The main data-placement planning concepts in Ceph +include: -- **Pools:** Ceph stores data within pools, which are logical groups for storing - objects. Pools manage the number of placement groups, the number of replicas, - and the CRUSH rule for the pool. To store data in a pool, you must have - an authenticated user with permissions for the pool. Ceph can snapshot pools. - See `Pools`_ for additional details. +- **Pools:** Ceph stores data within pools, which are logical groups used for + storing objects. Pools manage the number of placement groups, the number of + replicas, and the CRUSH rule for the pool. To store data in a pool, it is + necessary to be an authenticated user with permissions for the pool. Ceph is + able to make snapshots of pools. For additional details, see `Pools`_. -- **Placement Groups:** Ceph maps objects to placement groups (PGs). - Placement groups (PGs) are shards or fragments of a logical object pool - that place objects as a group into OSDs. Placement groups reduce the amount - of per-object metadata when Ceph stores the data in OSDs. A larger number of - placement groups (e.g., 100 per OSD) leads to better balancing. See - `Placement Groups`_ for additional details. +- **Placement Groups:** Ceph maps objects to placement groups. Placement + groups (PGs) are shards or fragments of a logical object pool that place + objects as a group into OSDs. Placement groups reduce the amount of + per-object metadata that is necessary for Ceph to store the data in OSDs. A + greater number of placement groups (for example, 100 PGs per OSD as compared + with 50 PGs per OSD) leads to better balancing. -- **CRUSH Maps:** CRUSH is a big part of what allows Ceph to scale without - performance bottlenecks, without limitations to scalability, and without a - single point of failure. CRUSH maps provide the physical topology of the - cluster to the CRUSH algorithm to determine where the data for an object - and its replicas should be stored, and how to do so across failure domains - for added data safety among other things. See `CRUSH Maps`_ for additional - details. +- **CRUSH Maps:** CRUSH plays a major role in allowing Ceph to scale while + avoiding certain pitfalls, such as performance bottlenecks, limitations to + scalability, and single points of failure. CRUSH maps provide the physical + topology of the cluster to the CRUSH algorithm, so that it can determine both + (1) where the data for an object and its replicas should be stored and (2) + how to store that data across failure domains so as to improve data safety. + For additional details, see `CRUSH Maps`_. -- **Balancer:** The balancer is a feature that will automatically optimize the - distribution of PGs across devices to achieve a balanced data distribution, - maximizing the amount of data that can be stored in the cluster and evenly - distributing the workload across OSDs. 
+- **Balancer:** The balancer is a feature that automatically optimizes the + distribution of placement groups across devices in order to achieve a + balanced data distribution, in order to maximize the amount of data that can + be stored in the cluster, and in order to evenly distribute the workload + across OSDs. -When you initially set up a test cluster, you can use the default values. Once -you begin planning for a large Ceph cluster, refer to pools, placement groups -and CRUSH for data placement operations. +It is possible to use the default values for each of the above components. +Default values are recommended for a test cluster's initial setup. However, +when planning a large Ceph cluster, values should be customized for +data-placement operations with reference to the different roles played by +pools, placement groups, and CRUSH. .. _Pools: ../pools .. _Placement Groups: ../placement-groups diff --git a/ceph/doc/rados/operations/devices.rst b/ceph/doc/rados/operations/devices.rst index 1b6eaebde..f92f622d5 100644 --- a/ceph/doc/rados/operations/devices.rst +++ b/ceph/doc/rados/operations/devices.rst @@ -3,28 +3,32 @@ Device Management ================= -Ceph tracks which hardware storage devices (e.g., HDDs, SSDs) are consumed by -which daemons, and collects health metrics about those devices in order to -provide tools to predict and/or automatically respond to hardware failure. +Device management allows Ceph to address hardware failure. Ceph tracks hardware +storage devices (HDDs, SSDs) to see which devices are managed by which daemons. +Ceph also collects health metrics about these devices. By doing so, Ceph can +provide tools that predict hardware failure and can automatically respond to +hardware failure. Device tracking --------------- -You can query which storage devices are in use with: +To see a list of the storage devices that are in use, run the following +command: .. prompt:: bash $ ceph device ls -You can also list devices by daemon or by host: +Alternatively, to list devices by daemon or by host, run a command of one of +the following forms: .. prompt:: bash $ ceph device ls-by-daemon <daemon> ceph device ls-by-host <host> -For any individual device, you can query information about its -location and how it is being consumed with: +To see information about the location of a specific device and about how the +device is being consumed, run a command of the following form: .. prompt:: bash $ @@ -33,103 +37,107 @@ location and how it is being consumed with: Identifying physical devices ---------------------------- -You can blink the drive LEDs on hardware enclosures to make the replacement of -failed disks easy and less error-prone. Use the following command:: +To make the replacement of failed disks easier and less error-prone, you can +(in some cases) "blink" the drive's LEDs on hardware enclosures by running a +command of the following form:: device light on|off <devid> [ident|fault] [--force] -The ``<devid>`` parameter is the device identification. You can obtain this -information using the following command: +.. note:: Using this command to blink the lights might not work. Whether it + works will depend upon such factors as your kernel revision, your SES + firmware, or the setup of your HBA. + +The ``<devid>`` parameter is the device identification. To retrieve this +information, run the following command: .. prompt:: bash $ ceph device ls -The ``[ident|fault]`` parameter is used to set the kind of light to blink. -By default, the `identification` light is used.
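For example, a minimal sketch for locating the physical drive behind one OSD
might look like the following; the daemon name ``osd.12`` and the device id are
purely illustrative, and (as noted above) whether the light actually blinks
depends on the enclosure, HBA, and firmware::

    # find the device id consumed by a given OSD daemon
    ceph device ls-by-daemon osd.12

    # blink the fault LED while the drive is being replaced, then turn it off
    ceph device light on SEAGATE_ST4000NM0023_Z1Z0ABC123 fault
    ceph device light off SEAGATE_ST4000NM0023_Z1Z0ABC123 fault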
+The ``[ident|fault]`` parameter determines which kind of light will blink. By +default, the `identification` light is used. -.. note:: - This command needs the Cephadm or the Rook `orchestrator `_ module enabled. - The orchestrator module enabled is shown by executing the following command: +.. note:: This command works only if the Cephadm or the Rook `orchestrator + `_ + module is enabled. To see which orchestrator module is enabled, run the + following command: .. prompt:: bash $ ceph orch status -The command behind the scene to blink the drive LEDs is `lsmcli`. If you need -to customize this command you can configure this via a Jinja2 template:: +The command that makes the drive's LEDs blink is `lsmcli`. To customize this +command, configure it via a Jinja2 template by running commands of the +following forms:: ceph config-key set mgr/cephadm/blink_device_light_cmd "