update ceph source to reef 18.2.0
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
parent
ab27109dd2
commit
05a536ef04
@ -1,7 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.16)
|
||||
|
||||
project(ceph
|
||||
VERSION 18.1.2
|
||||
VERSION 18.2.0
|
||||
LANGUAGES CXX C ASM)
|
||||
|
||||
cmake_policy(SET CMP0028 NEW)
|
||||
|
@ -77,7 +77,11 @@
|
||||
map and unmap images in namespaces using the `image-spec` syntax since then
|
||||
but the corresponding option available in most other commands was missing.
|
||||
* RGW: Compression is now supported for objects uploaded with Server-Side Encryption.
|
||||
When both are enabled, compression is applied before encryption.
|
||||
When both are enabled, compression is applied before encryption. Earlier releases
|
||||
of multisite do not replicate such objects correctly, so all zones must upgrade to
|
||||
Reef before enabling the `compress-encrypted` zonegroup feature: see
|
||||
https://docs.ceph.com/en/reef/radosgw/multisite/#zone-features and note the
|
||||
security considerations.
|
||||
* RGW: the "pubsub" functionality for storing bucket notifications inside Ceph
|
||||
is removed. Together with it, the "pubsub" zone should not be used anymore.
|
||||
The REST operations, as well as radosgw-admin commands for manipulating
|
||||
@ -124,6 +128,9 @@
|
||||
* RBD: list-watchers C++ API (`Image::list_watchers`) now clears the passed
|
||||
`std::list` before potentially appending to it, aligning with the semantics
|
||||
of the corresponding C API (`rbd_watchers_list`).
|
||||
* The rados python binding is now able to process (opt-in) omap keys as bytes
|
||||
objects. This enables interacting with RADOS omap keys that are not decodeable as
|
||||
UTF-8 strings.
|
||||
* Telemetry: Users who are opted-in to telemetry can also opt-in to
|
||||
participating in a leaderboard in the telemetry public
|
||||
dashboards (https://telemetry-public.ceph.com/). Users can now also add a
|
||||
@ -162,6 +169,8 @@
|
||||
fixes and enhancements.
|
||||
* For more detailed information see:
|
||||
https://docs.ceph.com/en/reef/rados/configuration/mclock-config-ref/
|
||||
* CEPHFS: After recovering a Ceph File System following the disaster recovery
|
||||
procedure, the recovered files under `lost+found` directory can now be deleted.
|
||||
|
||||
>=17.2.1
|
||||
|
||||
|
@ -23,33 +23,49 @@ contributed under the terms of the applicable license.
|
||||
|
||||
## Checking out the source
|
||||
|
||||
You can clone from github with
|
||||
Clone the ceph/ceph repository from github by running the following command on
|
||||
a system that has git installed:
|
||||
|
||||
git clone git@github.com:ceph/ceph
|
||||
|
||||
or, if you are not a github user,
|
||||
Alternatively, if you are not a github user, you should run the following
|
||||
command on a system that has git installed:
|
||||
|
||||
git clone https://github.com/ceph/ceph.git
|
||||
|
||||
Ceph contains many git submodules that need to be checked out with
|
||||
When the ceph/ceph repository has been cloned to your system, run the following
|
||||
command to check out the git submodules associated with the ceph/ceph
|
||||
repository:
|
||||
|
||||
git submodule update --init --recursive
|
||||
|
||||
|
||||
## Build Prerequisites
|
||||
|
||||
The list of Debian or RPM packages dependencies can be installed with:
|
||||
*section last updated 27 Jul 2023*
|
||||
|
||||
Make sure that ``curl`` is installed. The Debian and Ubuntu ``apt`` command is
|
||||
provided here, but if you use a system with a different package manager, then
|
||||
you must use whatever command is the proper counterpart of this one:
|
||||
|
||||
apt install curl
|
||||
|
||||
Install Debian or RPM package dependencies by running the following command:
|
||||
|
||||
./install-deps.sh
|
||||
|
||||
Install the ``python3-routes`` package:
|
||||
|
||||
apt install python3-routes
|
||||
|
||||
|
||||
## Building Ceph
|
||||
|
||||
Note that these instructions are meant for developers who are
|
||||
compiling the code for development and testing. To build binaries
|
||||
suitable for installation we recommend you build deb or rpm packages
|
||||
or refer to the `ceph.spec.in` or `debian/rules` to see which
|
||||
configuration options are specified for production builds.
|
||||
These instructions are meant for developers who are compiling the code for
|
||||
development and testing. To build binaries that are suitable for installation
|
||||
we recommend that you build .deb or .rpm packages, or refer to ``ceph.spec.in``
|
||||
or ``debian/rules`` to see which configuration options are specified for
|
||||
production builds.
|
||||
|
||||
Build instructions:
|
||||
|
||||
@ -57,21 +73,20 @@ Build instructions:
|
||||
cd build
|
||||
ninja
|
||||
|
||||
(do_cmake.sh now defaults to creating a debug build of ceph that can
|
||||
be up to 5x slower with some workloads. Please pass
|
||||
"-DCMAKE_BUILD_TYPE=RelWithDebInfo" to do_cmake.sh to create a non-debug
|
||||
release.
|
||||
``do_cmake.sh`` defaults to creating a debug build of Ceph that can be up to 5x
|
||||
slower with some workloads. Pass ``-DCMAKE_BUILD_TYPE=RelWithDebInfo`` to
|
||||
``do_cmake.sh`` to create a non-debug release.
|
||||
|
||||
The number of jobs used by `ninja` is derived from the number of CPU cores of
|
||||
the building host if unspecified. Use the `-j` option to limit the job number
|
||||
if the build jobs are running out of memory. On average, each job takes around
|
||||
2.5GiB memory.)
|
||||
2.5GiB memory.
|
||||
|
||||
This assumes you make your build dir a subdirectory of the ceph.git
|
||||
This assumes that you make your build directory a subdirectory of the ceph.git
|
||||
checkout. If you put it elsewhere, just point `CEPH_GIT_DIR` to the correct
|
||||
path to the checkout. Any additional CMake args can be specified by setting ARGS
|
||||
before invoking do_cmake. See [cmake options](#cmake-options)
|
||||
for more details. Eg.
|
||||
path to the checkout. Additional CMake args can be specified by setting ARGS
|
||||
before invoking ``do_cmake.sh``. See [cmake options](#cmake-options)
|
||||
for more details. For example:
|
||||
|
||||
ARGS="-DCMAKE_C_COMPILER=gcc-7" ./do_cmake.sh
|
||||
|
||||
|
@ -170,7 +170,7 @@
|
||||
# main package definition
|
||||
#################################################################################
|
||||
Name: ceph
|
||||
Version: 18.1.2
|
||||
Version: 18.2.0
|
||||
Release: 0%{?dist}
|
||||
%if 0%{?fedora} || 0%{?rhel}
|
||||
Epoch: 2
|
||||
@ -186,7 +186,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
|
||||
Group: System/Filesystems
|
||||
%endif
|
||||
URL: http://ceph.com/
|
||||
Source0: %{?_remote_tarball_prefix}ceph-18.1.2.tar.bz2
|
||||
Source0: %{?_remote_tarball_prefix}ceph-18.2.0.tar.bz2
|
||||
%if 0%{?suse_version}
|
||||
# _insert_obs_source_lines_here
|
||||
ExclusiveArch: x86_64 aarch64 ppc64le s390x
|
||||
@ -1292,7 +1292,7 @@ This package provides a Ceph MIB for SNMP traps.
|
||||
# common
|
||||
#################################################################################
|
||||
%prep
|
||||
%autosetup -p1 -n ceph-18.1.2
|
||||
%autosetup -p1 -n ceph-18.2.0
|
||||
|
||||
%build
|
||||
# Disable lto on systems that do not support symver attribute
|
||||
|
@ -1,7 +1,19 @@
|
||||
ceph (18.1.2-1jammy) jammy; urgency=medium
|
||||
ceph (18.2.0-1jammy) jammy; urgency=medium
|
||||
|
||||
|
||||
-- Jenkins Build Slave User <jenkins-build@braggi17.front.sepia.ceph.com> Tue, 27 Jun 2023 20:13:15 +0000
|
||||
-- Jenkins Build Slave User <jenkins-build@braggi17.front.sepia.ceph.com> Thu, 03 Aug 2023 18:57:50 +0000
|
||||
|
||||
ceph (18.2.0-1) stable; urgency=medium
|
||||
|
||||
* New upstream release
|
||||
|
||||
-- Ceph Release Team <ceph-maintainers@ceph.io> Thu, 03 Aug 2023 16:53:10 +0000
|
||||
|
||||
ceph (18.1.3-1) rc; urgency=medium
|
||||
|
||||
* New upstream release
|
||||
|
||||
-- Ceph Release Team <ceph-maintainers@ceph.io> Tue, 25 Jul 2023 02:48:09 +0000
|
||||
|
||||
ceph (18.1.2-1) rc; urgency=medium
|
||||
|
||||
|
@ -1 +1,3 @@
|
||||
lib/systemd/system/cephfs-mirror*
|
||||
usr/bin/cephfs-mirror
|
||||
usr/share/man/man8/cephfs-mirror.8
|
||||
|
@ -15,6 +15,10 @@ There are important considerations when planning these pools:
|
||||
- We recommend the fastest feasible low-latency storage devices (NVMe, Optane,
|
||||
or at the very least SAS/SATA SSD) for the metadata pool, as this will
|
||||
directly affect the latency of client file system operations.
|
||||
- We strongly suggest that the CephFS metadata pool be provisioned on dedicated
|
||||
SSD / NVMe OSDs. This ensures that high client workload does not adversely
|
||||
impact metadata operations. See :ref:`device_classes` to configure pools this
|
||||
way.
|
||||
- The data pool used to create the file system is the "default" data pool and
|
||||
the location for storing all inode backtrace information, which is used for hard link
|
||||
management and disaster recovery. For this reason, all CephFS inodes
|
||||
|
@ -57,6 +57,8 @@
|
||||
.. confval:: mds_kill_import_at
|
||||
.. confval:: mds_kill_link_at
|
||||
.. confval:: mds_kill_rename_at
|
||||
.. confval:: mds_inject_skip_replaying_inotable
|
||||
.. confval:: mds_kill_skip_replaying_inotable
|
||||
.. confval:: mds_wipe_sessions
|
||||
.. confval:: mds_wipe_ino_prealloc
|
||||
.. confval:: mds_skip_ino
|
||||
|
@ -21,6 +21,133 @@ We can get hints about what's going on by dumping the MDS cache ::
|
||||
If high logging levels are set on the MDS, that will almost certainly hold the
|
||||
information we need to diagnose and solve the issue.
|
||||
|
||||
Stuck during recovery
|
||||
=====================
|
||||
|
||||
Stuck in up:replay
|
||||
------------------
|
||||
|
||||
If your MDS is stuck in ``up:replay`` then it is likely that the journal is
|
||||
very long. Did you see ``MDS_HEALTH_TRIM`` cluster warnings saying the MDS is
|
||||
behind on trimming its journal? If the journal has grown very large, it can
|
||||
take hours to read the journal. There is no working around this, but there
|
||||
are things you can do to speed things along:
|
||||
|
||||
Reduce MDS debugging to 0. Even at the default settings, the MDS logs some
|
||||
messages to memory for dumping if a fatal error is encountered. You can avoid
|
||||
this:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
ceph config set mds debug_mds 0
|
||||
ceph config set mds debug_ms 0
|
||||
ceph config set mds debug_monc 0
|
||||
|
||||
Note that if the MDS fails, there will be virtually no information to determine
|
||||
why. If you can calculate when ``up:replay`` will complete, you should restore
|
||||
these configs just prior to entering the next state:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
ceph config rm mds debug_mds
|
||||
ceph config rm mds debug_ms
|
||||
ceph config rm mds debug_monc
|
||||
|
||||
Once you've got replay moving along faster, you can calculate when the MDS will
|
||||
complete. This is done by examining the journal replay status:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ ceph tell mds.<fs_name>:0 status | jq .replay_status
|
||||
{
|
||||
"journal_read_pos": 4195244,
|
||||
"journal_write_pos": 4195244,
|
||||
"journal_expire_pos": 4194304,
|
||||
"num_events": 2,
|
||||
"num_segments": 2
|
||||
}
|
||||
|
||||
Replay completes when the ``journal_read_pos`` reaches the
|
||||
``journal_write_pos``. The write position will not change during replay. Track
|
||||
the progression of the read position to compute the expected time to complete.
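
For example, a rough back-of-the-envelope sketch of that calculation, assuming ``jq`` is installed and ``<fs_name>`` is replaced with your file system name (the 60-second sampling window is arbitrary):

.. code:: bash

   # Sample the journal read position twice, 60 seconds apart
   P1=$(ceph tell mds.<fs_name>:0 status | jq .replay_status.journal_read_pos)
   sleep 60
   P2=$(ceph tell mds.<fs_name>:0 status | jq .replay_status.journal_read_pos)
   W=$(ceph tell mds.<fs_name>:0 status | jq .replay_status.journal_write_pos)

   # Replay rate in bytes per second, then a rough estimate of seconds left
   echo "rate: $(( (P2 - P1) / 60 )) bytes/s"
   echo "remaining: $(( (W - P2) * 60 / (P2 - P1) )) seconds"

If the read position has not advanced between the two samples, replay is not making progress and the estimate is meaningless.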
|
||||
|
||||
|
||||
Avoiding recovery roadblocks
|
||||
----------------------------
|
||||
|
||||
When trying to urgently restore your file system during an outage, here are some
|
||||
things to do:
|
||||
|
||||
* **Deny all reconnect to clients.** This effectively blocklists all existing
|
||||
CephFS sessions so all mounts will hang or become unavailable.
|
||||
|
||||
.. code:: bash
|
||||
|
||||
ceph config set mds mds_deny_all_reconnect true
|
||||
|
||||
Remember to undo this after the MDS becomes active.
|
||||
|
||||
.. note:: This does not prevent new sessions from connecting. For that, see the ``refuse_client_session`` file system setting.
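
When the MDS reaches ``up:active``, a minimal sketch of undoing the setting (restoring the default):

.. code:: bash

   ceph config rm mds mds_deny_all_reconnect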
|
||||
|
||||
* **Extend the MDS heartbeat grace period**. This avoids replacing an MDS that appears
|
||||
"stuck" doing some operation. Sometimes recovery of an MDS may involve an
|
||||
operation that may take longer than expected (from the programmer's
|
||||
perspective). This is more likely when recovery is already taking longer than
normal to complete (as indicated by your reading this document).
Avoid unnecessary replacement loops by extending the heartbeat grace period:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
ceph config set mds mds_heartbeat_reset_grace 3600
|
||||
|
||||
This has the effect of having the MDS continue to send beacons to the monitors
|
||||
even when its internal "heartbeat" mechanism has not been reset (beat) in one
|
||||
hour. Note the previous mechanism for achieving this was via the
|
||||
`mds_beacon_grace` monitor setting.
|
||||
|
||||
* **Disable open file table prefetch.** Normally, the MDS will prefetch
|
||||
directory contents during recovery to heat up its cache. During long
|
||||
recovery, the cache is probably already hot **and large**. So this behavior
|
||||
can be undesirable. Disable using:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
ceph config set mds mds_oft_prefetch_dirfrags false
|
||||
|
||||
* **Turn off clients.** Clients reconnecting to the newly ``up:active`` MDS may
|
||||
cause new load on the file system when it's just getting back on its feet.
|
||||
There will likely be some general maintenance to do before workloads should be
|
||||
resumed. For example, expediting journal trim may be advisable if the recovery
|
||||
took a long time because replay was reading an overly large journal.
|
||||
|
||||
You can do this manually or use the new file system tunable:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
ceph fs set <fs_name> refuse_client_session true
|
||||
|
||||
That prevents any clients from establishing new sessions with the MDS.
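
Once maintenance is complete, a sketch of letting clients back in (assuming the default of ``false`` should be restored):

.. code:: bash

   ceph fs set <fs_name> refuse_client_session false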
|
||||
|
||||
|
||||
|
||||
Expediting MDS journal trim
|
||||
===========================
|
||||
|
||||
If your MDS journal grew too large (maybe your MDS was stuck in up:replay for a
|
||||
long time!), you will want to have the MDS trim its journal more frequently.
|
||||
You will know the journal is too large because of ``MDS_HEALTH_TRIM`` warnings.
|
||||
|
||||
The main tunable available to do this is to modify the MDS tick interval. The
|
||||
"tick" interval drives several upkeep activities in the MDS. It is strongly
|
||||
recommended that no significant file system load be present when modifying this tick
|
||||
interval. This setting only affects an MDS in ``up:active``. The MDS does not
|
||||
trim its journal during recovery.
|
||||
|
||||
.. code:: bash
|
||||
|
||||
ceph config set mds mds_tick_interval 2
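
After the ``MDS_HEALTH_TRIM`` warnings clear, a sketch of reverting to the default tick interval:

.. code:: bash

   ceph config rm mds mds_tick_interval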
|
||||
|
||||
|
||||
RADOS Health
|
||||
============
|
||||
|
||||
|
@ -3,9 +3,74 @@ Serialization (encode/decode)
|
||||
=============================
|
||||
|
||||
When a structure is sent over the network or written to disk, it is
|
||||
encoded into a string of bytes. Serializable structures have
|
||||
``encode`` and ``decode`` methods that write and read from ``bufferlist``
|
||||
objects representing byte strings.
|
||||
encoded into a string of bytes. Usually (but not always -- multiple
|
||||
serialization facilities coexist in Ceph) serializable structures
|
||||
have ``encode`` and ``decode`` methods that write and read from
|
||||
``bufferlist`` objects representing byte strings.
|
||||
|
||||
Terminology
|
||||
-----------
|
||||
It is best to think not in the domain of daemons and clients but
|
||||
encoders and decoders. An encoder serializes a structure into a bufferlist
|
||||
while a decoder does the opposite.
|
||||
|
||||
Encoders and decoders can be referred to collectively as dencoders.
|
||||
|
||||
Dencoders (both encoders and decoders) live within daemons and clients.
|
||||
For instance, when an RBD client issues an IO operation, it prepares
|
||||
an instance of the ``MOSDOp`` structure and encodes it into a bufferlist
|
||||
that is put on the wire.
|
||||
An OSD reads these bytes and decodes them back into an ``MOSDOp`` instance.
|
||||
Here the encoder was used by the client and the decoder by the OSD. However,
these roles can swap -- just imagine the handling of the response: the OSD encodes
the ``MOSDOpReply`` while RBD clients decode it.
|
||||
|
||||
The encoder and decoder operate according to a format which the programmer
defines by implementing the ``encode`` and ``decode`` methods.
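
A convenient way to poke at these formats from the command line is the ``ceph-dencoder`` tool. A sketch (the ``object_info_t`` type and the test index are only illustrative):

.. code-block:: bash

   # List the structures the tool knows how to encode and decode
   ceph-dencoder list_types

   # Round-trip a built-in test instance of one structure and dump it as JSON
   ceph-dencoder type object_info_t select_test 1 encode decode dump_json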
|
||||
|
||||
Principles for format change
|
||||
----------------------------
|
||||
It is not unusual for the format of serialization to change. This
process requires careful attention during both development
and review.
|
||||
|
||||
The general rule is that a decoder must understand what has been
encoded by an encoder. Most of the problems come from ensuring
that compatibility continues between old decoders and new encoders
as well as between new decoders and old encoders. One should assume
that -- unless stated otherwise -- any mix of old and new is
possible in a cluster. There are two main reasons for that:
|
||||
|
||||
1. Upgrades. Although there are recommendations related to the order
|
||||
of entity types (mons/osds/clients), it is not mandatory and
|
||||
no assumption should be made about it.
|
||||
2. Huge variability of client versions. It has always been the case
that kernel (and thus kernel client) upgrades are decoupled
from Ceph upgrades. Moreover, the proliferation of containerization
brings this variability even to e.g. ``librbd`` -- user-space
libraries now live in their own containers.
|
||||
|
||||
With this being said, there are a few rules limiting the degree
of interoperability between dencoders:

* ``n-2`` for dencoding between daemons,
* ``n-3`` as a hard requirement for client-involved scenarios,
* ``n-3..`` as a soft requirement for client-involved scenarios. Ideally
  every client should be able to talk to any version of daemons.
|
||||
|
||||
As the underlying reasons are the same, the rules dencoders
|
||||
follow are virtually the same as for deprecations of our feature
|
||||
bits. See the ``Notes on deprecation`` in ``src/include/ceph_features.h``.
|
||||
|
||||
Frameworks
|
||||
----------
|
||||
Currently multiple genres of dencoding helpers co-exist.
|
||||
|
||||
* encoding.h (the most proliferated one),
|
||||
* denc.h (performance optimized, seen mostly in ``BlueStore``),
|
||||
* the `Message` hierarchy.
|
||||
|
||||
Although details vary, the interoperability rules stay the same.
|
||||
|
||||
Adding a field to a structure
|
||||
-----------------------------
|
||||
@ -93,3 +158,69 @@ because we might still be passed older-versioned messages that do not
|
||||
have the field. The ``struct_v`` variable is a local set by the ``DECODE_START``
|
||||
macro.
|
||||
|
||||
Into the weeds
--------------
|
||||
|
||||
The append-extendability of our dencoders is a result of the forward
|
||||
compatibility that the ``ENCODE_START`` and ``DECODE_FINISH`` macros bring.
|
||||
|
||||
They implement the extensibility facilities. An encoder, when filling
the bufferlist, prepends three fields: the version of the current format,
the minimal version of a decoder compatible with it, and the total size of
all encoded fields.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
/**
|
||||
* start encoding block
|
||||
*
|
||||
* @param v current (code) version of the encoding
|
||||
* @param compat oldest code version that can decode it
|
||||
* @param bl bufferlist to encode to
|
||||
*
|
||||
*/
|
||||
#define ENCODE_START(v, compat, bl) \
|
||||
__u8 struct_v = v; \
|
||||
__u8 struct_compat = compat; \
|
||||
ceph_le32 struct_len; \
|
||||
auto filler = (bl).append_hole(sizeof(struct_v) + \
|
||||
sizeof(struct_compat) + sizeof(struct_len)); \
|
||||
const auto starting_bl_len = (bl).length(); \
|
||||
using ::ceph::encode; \
|
||||
do {
|
||||
|
||||
The ``struct_len`` field allows the decoder to consume all the bytes that were
left undecoded by the user-provided ``decode`` implementation.
Analogously, the decoder tracks how much input has been decoded in the
user-provided ``decode`` methods.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
#define DECODE_START(bl) \
|
||||
unsigned struct_end = 0; \
|
||||
__u32 struct_len; \
|
||||
decode(struct_len, bl); \
|
||||
... \
|
||||
struct_end = bl.get_off() + struct_len; \
|
||||
} \
|
||||
do {
|
||||
|
||||
|
||||
The decoder uses this information to discard the extra bytes it does not
understand. Advancing the bufferlist is critical, as dencoders tend to be nested;
just leaving it intact would work only for the very last ``decode`` call
in a nested structure.
|
||||
|
||||
.. code-block:: cpp
|
||||
|
||||
#define DECODE_FINISH(bl) \
|
||||
} while (false); \
|
||||
if (struct_end) { \
|
||||
... \
|
||||
if (bl.get_off() < struct_end) \
|
||||
bl += struct_end - bl.get_off(); \
|
||||
}
|
||||
|
||||
|
||||
This entire cooperative mechanism allows an encoder (in its later revisions)
to generate a longer byte stream (due to e.g. adding a new field at the end)
without worrying that the residue will crash older decoder revisions.
|
||||
|
@ -16,32 +16,6 @@ mgr module
|
||||
The following diagrams outline the involved parties and how they interact when the clients
|
||||
query for the reports:
|
||||
|
||||
.. seqdiag::
|
||||
|
||||
seqdiag {
|
||||
default_note_color = lightblue;
|
||||
osd; mon; ceph-cli;
|
||||
osd => mon [ label = "update osdmap service" ];
|
||||
osd => mon [ label = "update osdmap service" ];
|
||||
ceph-cli -> mon [ label = "send 'health' command" ];
|
||||
mon -> mon [ leftnote = "gather checks from services" ];
|
||||
ceph-cli <-- mon [ label = "checks and mutes" ];
|
||||
}
|
||||
|
||||
.. seqdiag::
|
||||
|
||||
seqdiag {
|
||||
default_note_color = lightblue;
|
||||
osd; mon; mgr; mgr-module;
|
||||
mgr -> mon [ label = "subscribe for 'mgrdigest'" ];
|
||||
osd => mon [ label = "update osdmap service" ];
|
||||
osd => mon [ label = "update osdmap service" ];
|
||||
mon -> mgr [ label = "send MMgrDigest" ];
|
||||
mgr -> mgr [ note = "update cluster state" ];
|
||||
mon <-- mgr;
|
||||
mgr-module -> mgr [ label = "mgr.get('health')" ];
|
||||
mgr-module <-- mgr [ label = "heath reports in json" ];
|
||||
}
|
||||
|
||||
Where are the Reports Generated
|
||||
===============================
|
||||
@ -68,19 +42,6 @@ later loaded and decoded, so they can be collected on demand. When it comes to
|
||||
``MDSMonitor``, it persists the health metrics in the beacon sent by the MDS daemons,
|
||||
and prepares health reports when storing the pending changes.
|
||||
|
||||
.. seqdiag::
|
||||
|
||||
seqdiag {
|
||||
default_note_color = lightblue;
|
||||
mds; mon-mds; mon-health; ceph-cli;
|
||||
mds -> mon-mds [ label = "send beacon" ];
|
||||
mon-mds -> mon-mds [ note = "store health metrics in beacon" ];
|
||||
mds <-- mon-mds;
|
||||
mon-mds -> mon-mds [ note = "encode_health(checks)" ];
|
||||
ceph-cli -> mon-health [ label = "send 'health' command" ];
|
||||
mon-health => mon-mds [ label = "gather health checks" ];
|
||||
ceph-cli <-- mon-health [ label = "checks and mutes" ];
|
||||
}
|
||||
|
||||
So, if we want to add a new warning related to cephfs, probably the best place to
|
||||
start is ``MDSMonitor::encode_pending()``, where health reports are collected from
|
||||
@ -106,23 +67,3 @@ metrics and status to mgr using ``MMgrReport``. On the mgr side, it periodically
|
||||
an aggregated report to the ``MgrStatMonitor`` service on mon. As explained earlier,
|
||||
this service just persists the health reports in the aggregated report to the monstore.
|
||||
|
||||
.. seqdiag::
|
||||
|
||||
seqdiag {
|
||||
default_note_color = lightblue;
|
||||
service; mgr; mon-mgr-stat; mon-health;
|
||||
service -> mgr [ label = "send(open)" ];
|
||||
mgr -> mgr [ note = "register the new service" ];
|
||||
service <-- mgr;
|
||||
mgr => service [ label = "send(configure)" ];
|
||||
service -> mgr [ label = "send(report)" ];
|
||||
mgr -> mgr [ note = "update/aggregate service metrics" ];
|
||||
service <-- mgr;
|
||||
service => mgr [ label = "send(report)" ];
|
||||
mgr -> mon-mgr-stat [ label = "send(mgr-report)" ];
|
||||
mon-mgr-stat -> mon-mgr-stat [ note = "store health checks in the report" ];
|
||||
mgr <-- mon-mgr-stat;
|
||||
mon-health => mon-mgr-stat [ label = "gather health checks" ];
|
||||
service => mgr [ label = "send(report)" ];
|
||||
service => mgr [ label = "send(close)" ];
|
||||
}
|
||||
|
@ -208,31 +208,32 @@ A Ceph daemon has the ability to emit a set of perf counter instances with varyi
|
||||
For example, the below counters show the number of put requests for different users on different buckets::
|
||||
|
||||
{
|
||||
"rgw": {
|
||||
"labels": {
|
||||
"Bucket: "bkt1",
|
||||
"User: "user1",
|
||||
},
|
||||
"counters": {
|
||||
"put": 1,
|
||||
},
|
||||
},
|
||||
"rgw": {
|
||||
"labels": {
|
||||
},
|
||||
"counters": {
|
||||
"put": 4,
|
||||
},
|
||||
},
|
||||
"rgw": {
|
||||
"labels": {
|
||||
"Bucket: "bkt1",
|
||||
"User: "user2",
|
||||
},
|
||||
"counters": {
|
||||
"put": 3,
|
||||
},
|
||||
}
|
||||
"rgw": [
|
||||
{
|
||||
"labels": {
|
||||
"Bucket: "bkt1",
|
||||
"User: "user1",
|
||||
},
|
||||
"counters": {
|
||||
"put": 1,
|
||||
},
|
||||
},
|
||||
{
|
||||
"labels": {},
|
||||
"counters": {
|
||||
"put": 4,
|
||||
},
|
||||
},
|
||||
{
|
||||
"labels": {
|
||||
"Bucket: "bkt1",
|
||||
"User: "user2",
|
||||
},
|
||||
"counters": {
|
||||
"put": 3,
|
||||
},
|
||||
},
|
||||
]
|
||||
}
|
||||
|
||||
All labeled and unlabeled perf counters can be viewed with ``ceph daemon {daemon id} counter dump``.
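
For example, a sketch of pulling the labeled RGW counters shown above from a running daemon's admin socket (the daemon name and the use of ``jq`` are only illustrative):

.. code-block:: bash

   ceph daemon client.rgw.mygateway counter dump | jq '.rgw'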
|
||||
|
@ -137,6 +137,6 @@ First release candidate
|
||||
First stable release
|
||||
====================
|
||||
|
||||
- [ ] src/ceph_release: change type `stable`
|
||||
- [x] src/ceph_release: change type `stable`
|
||||
- [ ] generate new object corpus for encoding/decoding tests - see :doc:`corpus`
|
||||
- [ ] src/cephadm/cephadm: update `LATEST_STABLE_RELEASE`
|
||||
|
@ -29,21 +29,21 @@ Premier
|
||||
|
||||
* `Bloomberg <https://bloomberg.com>`_
|
||||
* `Clyso <https://www.clyso.com/en/>`_
|
||||
* `DigitalOcean <https://www.digitalocean.com/>`_
|
||||
* `IBM <https://ibm.com>`_
|
||||
* `Intel <http://www.intel.com/>`_
|
||||
* `OVH <https://www.ovh.com/>`_
|
||||
* `Samsung Electronics <https://samsung.com/>`_
|
||||
* `Western Digital <https://www.wdc.com/>`_
|
||||
* `XSKY <https://www.xsky.com/en/>`_
|
||||
* `ZTE <https://www.zte.com.cn/global/>`_
|
||||
|
||||
General
|
||||
-------
|
||||
|
||||
* `42on <https://www.42on.com/>`_
|
||||
* `Akamai <https://www.akamai.com/>`_
|
||||
* `ARM <http://www.arm.com/>`_
|
||||
* `Canonical <https://www.canonical.com/>`_
|
||||
* `Cloudbase Solutions <https://cloudbase.it/>`_
|
||||
* `CloudFerro <https://cloudferro.com/>`_
|
||||
* `croit <http://www.croit.io/>`_
|
||||
* `EasyStack <https://www.easystack.io/>`_
|
||||
* `ISS <http://iss-integration.com/>`_
|
||||
@ -96,7 +96,6 @@ Members
|
||||
-------
|
||||
|
||||
* Anjaneya "Reddy" Chagam (Intel)
|
||||
* Alex Marangone (DigitalOcean)
|
||||
* Carlos Maltzahn (UCSC) - Associate member representative
|
||||
* Dan van der Ster (CERN) - Ceph Council representative
|
||||
* Haomai Wang (XSKY)
|
||||
@ -111,8 +110,6 @@ Members
|
||||
* Steven Umbehocker (OSNexus) - General member representative
|
||||
* Pawel Sadowski (OVH)
|
||||
* Vincent Hsu (IBM)
|
||||
* Xie Xingguo (ZTE)
|
||||
* Zhang Shaowen (China Mobile)
|
||||
|
||||
Joining
|
||||
=======
|
||||
|
@ -80,8 +80,8 @@ Current Members
|
||||
|
||||
* Adam King <adking@redhat.com>
|
||||
* Casey Bodley <cbodley@redhat.com>
|
||||
* Dan van der Ster <daniel.vanderster@cern.ch>
|
||||
* David Orman <ormandj@iland.com>
|
||||
* Dan van der Ster <dan.vanderster@clyso.com>
|
||||
* David Orman <ormandj@1111systems.com>
|
||||
* Ernesto Puerta <epuerta@redhat.com>
|
||||
* Gregory Farnum <gfarnum@redhat.com>
|
||||
* Haomai Wang <haomai@xsky.com>
|
||||
|
@ -41,14 +41,16 @@ So, prior to start consuming the Ceph API, a valid JSON Web Token (JWT) has to
|
||||
be obtained, and it may then be reused for subsequent requests. The
|
||||
``/api/auth`` endpoint will provide the valid token:
|
||||
|
||||
.. code-block:: sh
|
||||
.. prompt:: bash $
|
||||
|
||||
$ curl -X POST "https://example.com:8443/api/auth" \
|
||||
-H "Accept: application/vnd.ceph.api.v1.0+json" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"username": <username>, "password": <password>}'
|
||||
curl -X POST "https://example.com:8443/api/auth" \
|
||||
-H "Accept: application/vnd.ceph.api.v1.0+json" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"username": <username>, "password": <password>}'
|
||||
|
||||
{ "token": "<redacted_token>", ...}
|
||||
::
|
||||
|
||||
{ "token": "<redacted_token>", ...}
|
||||
|
||||
The token obtained must be passed together with every API request in the
|
||||
``Authorization`` HTTP header::
|
||||
@ -74,11 +76,11 @@ purpose, Ceph API is built upon the following principles:
|
||||
|
||||
An example:
|
||||
|
||||
.. code-block:: bash
|
||||
.. prompt:: bash $
|
||||
|
||||
$ curl -X GET "https://example.com:8443/api/osd" \
|
||||
-H "Accept: application/vnd.ceph.api.v1.0+json" \
|
||||
-H "Authorization: Bearer <token>"
|
||||
curl -X GET "https://example.com:8443/api/osd" \
|
||||
-H "Accept: application/vnd.ceph.api.v1.0+json" \
|
||||
-H "Authorization: Bearer <token>"
|
||||
|
||||
|
||||
Specification
|
||||
|
@ -31,7 +31,7 @@ Create NFS Ganesha Cluster
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ nfs cluster create <cluster_id> [<placement>] [--ingress] [--virtual_ip <value>] [--ingress-mode {default|keepalive-only}] [--port <int>]
|
||||
$ ceph nfs cluster create <cluster_id> [<placement>] [--ingress] [--virtual_ip <value>] [--ingress-mode {default|keepalive-only|haproxy-standard|haproxy-protocol}] [--port <int>]
|
||||
|
||||
This creates a common recovery pool for all NFS Ganesha daemons, new user based on
|
||||
``cluster_id``, and a common NFS Ganesha config RADOS object.
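
For example, a hypothetical invocation that deploys two daemons behind an ingress service (the cluster id, hosts, and virtual IP are made up for illustration):

.. code:: bash

   $ ceph nfs cluster create mynfs "2 host1,host2" --ingress --virtual_ip 10.0.0.20/24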
|
||||
|
@ -25,7 +25,7 @@ supports both passing the arguments through the cmd line or as a spec file:
|
||||
|
||||
.. prompt:: bash #
|
||||
|
||||
rgw realm bootstrap [--realm-name] [--zonegroup-name] [--zone-name] [--port] [--placement] [--start-radosgw]
|
||||
ceph rgw realm bootstrap [--realm-name] [--zonegroup-name] [--zone-name] [--port] [--placement] [--start-radosgw]
|
||||
|
||||
The command supports providing the configuration through a spec file (`-i option`):
|
||||
|
||||
@ -33,7 +33,7 @@ The command supports providing the configuration through a spec file (`-i option
|
||||
|
||||
ceph rgw realm bootstrap -i myrgw.yaml
|
||||
|
||||
Following is an example of RGW mutlisite spec file:
|
||||
Following is an example of RGW multisite spec file:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
|
@ -4,116 +4,116 @@
|
||||
Configuring Ceph
|
||||
==================
|
||||
|
||||
When Ceph services start, the initialization process activates a series
|
||||
of daemons that run in the background. A :term:`Ceph Storage Cluster` runs
|
||||
at a minimum three types of daemons:
|
||||
When Ceph services start, the initialization process activates a series of
|
||||
daemons that run in the background. A :term:`Ceph Storage Cluster` runs at
|
||||
least three types of daemons:
|
||||
|
||||
- :term:`Ceph Monitor` (``ceph-mon``)
|
||||
- :term:`Ceph Manager` (``ceph-mgr``)
|
||||
- :term:`Ceph OSD Daemon` (``ceph-osd``)
|
||||
|
||||
Ceph Storage Clusters that support the :term:`Ceph File System` also run at
|
||||
least one :term:`Ceph Metadata Server` (``ceph-mds``). Clusters that
|
||||
support :term:`Ceph Object Storage` run Ceph RADOS Gateway daemons
|
||||
(``radosgw``) as well.
|
||||
least one :term:`Ceph Metadata Server` (``ceph-mds``). Clusters that support
|
||||
:term:`Ceph Object Storage` run Ceph RADOS Gateway daemons (``radosgw``).
|
||||
|
||||
Each daemon has a number of configuration options, each of which has a
|
||||
default value. You may adjust the behavior of the system by changing these
|
||||
configuration options. Be careful to understand the consequences before
|
||||
Each daemon has a number of configuration options, each of which has a default
|
||||
value. You may adjust the behavior of the system by changing these
|
||||
configuration options. Be careful to understand the consequences before
|
||||
overriding default values, as it is possible to significantly degrade the
|
||||
performance and stability of your cluster. Also note that default values
|
||||
sometimes change between releases, so it is best to review the version of
|
||||
this documentation that aligns with your Ceph release.
|
||||
performance and stability of your cluster. Note too that default values
|
||||
sometimes change between releases. For this reason, it is best to review the
|
||||
version of this documentation that applies to your Ceph release.
|
||||
|
||||
Option names
|
||||
============
|
||||
|
||||
All Ceph configuration options have a unique name consisting of words
|
||||
formed with lower-case characters and connected with underscore
|
||||
(``_``) characters.
|
||||
Each of the Ceph configuration options has a unique name that consists of words
|
||||
formed with lowercase characters and connected with underscore characters
|
||||
(``_``).
|
||||
|
||||
When option names are specified on the command line, either underscore
|
||||
(``_``) or dash (``-``) characters can be used interchangeable (e.g.,
|
||||
When option names are specified on the command line, underscore (``_``) and
|
||||
dash (``-``) characters can be used interchangeably (for example,
|
||||
``--mon-host`` is equivalent to ``--mon_host``).
|
||||
|
||||
When option names appear in configuration files, spaces can also be
|
||||
used in place of underscore or dash. We suggest, though, that for
|
||||
clarity and convenience you consistently use underscores, as we do
|
||||
When option names appear in configuration files, spaces can also be used in
|
||||
place of underscores or dashes. However, for the sake of clarity and
|
||||
convenience, we suggest that you consistently use underscores, as we do
|
||||
throughout this documentation.
|
||||
|
||||
Config sources
|
||||
==============
|
||||
|
||||
Each Ceph daemon, process, and library will pull its configuration
|
||||
from several sources, listed below. Sources later in the list will
|
||||
override those earlier in the list when both are present.
|
||||
Each Ceph daemon, process, and library pulls its configuration from one or more
|
||||
of the several sources listed below. Sources that occur later in the list
|
||||
override those that occur earlier in the list (when both are present).
|
||||
|
||||
- the compiled-in default value
|
||||
- the monitor cluster's centralized configuration database
|
||||
- a configuration file stored on the local host
|
||||
- environment variables
|
||||
- command line arguments
|
||||
- runtime overrides set by an administrator
|
||||
- command-line arguments
|
||||
- runtime overrides that are set by an administrator
|
||||
|
||||
One of the first things a Ceph process does on startup is parse the
|
||||
configuration options provided via the command line, environment, and
|
||||
local configuration file. The process will then contact the monitor
|
||||
cluster to retrieve configuration stored centrally for the entire
|
||||
cluster. Once a complete view of the configuration is available, the
|
||||
daemon or process startup will proceed.
|
||||
configuration options provided via the command line, via the environment, and
|
||||
via the local configuration file. Next, the process contacts the monitor
|
||||
cluster to retrieve centrally-stored configuration for the entire cluster.
|
||||
After a complete view of the configuration is available, the startup of the
|
||||
daemon or process will commence.
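
Once a daemon is running, a sketch of checking which value it actually uses and where that value came from (the daemon id and option are only examples):

.. prompt:: bash $

   ceph config show osd.0 debug_ms
   ceph config show-with-defaults osd.0 | grep debug_ms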
|
||||
|
||||
.. _bootstrap-options:
|
||||
|
||||
Bootstrap options
|
||||
-----------------
|
||||
|
||||
Some configuration options affect the process's ability to contact the
|
||||
monitors, to authenticate, and to retrieve the cluster-stored configuration.
|
||||
For this reason, these options might need to be stored locally on the node, and
|
||||
set by means of a local configuration file. These options include the
|
||||
following:
|
||||
Bootstrap options are configuration options that affect the process's ability
|
||||
to contact the monitors, to authenticate, and to retrieve the cluster-stored
|
||||
configuration. For this reason, these options might need to be stored locally
|
||||
on the node, and set by means of a local configuration file. These options
|
||||
include the following:
|
||||
|
||||
.. confval:: mon_host
|
||||
.. confval:: mon_host_override
|
||||
|
||||
- :confval:`mon_dns_srv_name`
|
||||
- :confval:`mon_data`, :confval:`osd_data`, :confval:`mds_data`, :confval:`mgr_data`, and
|
||||
similar options that define which local directory the daemon
|
||||
stores its data in.
|
||||
- :confval:`keyring`, :confval:`keyfile`, and/or :confval:`key`, which can be used to
|
||||
specify the authentication credential to use to authenticate with
|
||||
the monitor. Note that in most cases the default keyring location
|
||||
is in the data directory specified above.
|
||||
- :confval:`mon_data`, :confval:`osd_data`, :confval:`mds_data`,
|
||||
:confval:`mgr_data`, and similar options that define which local directory
|
||||
the daemon stores its data in.
|
||||
- :confval:`keyring`, :confval:`keyfile`, and/or :confval:`key`, which can be
|
||||
used to specify the authentication credential to use to authenticate with the
|
||||
monitor. Note that in most cases the default keyring location is in the data
|
||||
directory specified above.
|
||||
|
||||
In most cases, the default values of these options are suitable. There is one
|
||||
exception to this: the :confval:`mon_host` option that identifies the addresses
|
||||
of the cluster's monitors. When DNS is used to identify monitors, a local Ceph
|
||||
In most cases, there is no reason to modify the default values of these
|
||||
options. However, there is one exception to this: the :confval:`mon_host`
|
||||
option that identifies the addresses of the cluster's monitors. But when
|
||||
:ref:`DNS is used to identify monitors<mon-dns-lookup>`, a local Ceph
|
||||
configuration file can be avoided entirely.
|
||||
|
||||
|
||||
Skipping monitor config
|
||||
-----------------------
|
||||
|
||||
Pass the option ``--no-mon-config`` to any process to skip the step that
|
||||
retrieves configuration information from the cluster monitors. This is useful
|
||||
in cases where configuration is managed entirely via configuration files, or
|
||||
when the monitor cluster is down and some maintenance activity needs to be
|
||||
done.
|
||||
|
||||
The option ``--no-mon-config`` can be passed in any command in order to skip
|
||||
the step that retrieves configuration information from the cluster's monitors.
|
||||
Skipping this retrieval step can be useful in cases where configuration is
|
||||
managed entirely via configuration files, or when maintenance activity needs to
|
||||
be done but the monitor cluster is down.
|
||||
|
||||
.. _ceph-conf-file:
|
||||
|
||||
|
||||
Configuration sections
|
||||
======================
|
||||
|
||||
Any given process or daemon has a single value for each configuration
|
||||
option. However, values for an option may vary across different
|
||||
daemon types even daemons of the same type. Ceph options that are
|
||||
stored in the monitor configuration database or in local configuration
|
||||
files are grouped into sections to indicate which daemons or clients
|
||||
they apply to.
|
||||
Each of the configuration options associated with a single process or daemon
|
||||
has a single value. However, the values for a configuration option can vary
|
||||
across daemon types, and can vary even across different daemons of the same
|
||||
type. Ceph options that are stored in the monitor configuration database or in
|
||||
local configuration files are grouped into sections |---| so-called "configuration
|
||||
sections" |---| to indicate which daemons or clients they apply to.
|
||||
|
||||
These sections include:
|
||||
|
||||
These sections include the following:
|
||||
|
||||
.. confsec:: global
|
||||
|
||||
@ -156,43 +156,42 @@ These sections include:
|
||||
|
||||
.. confsec:: client
|
||||
|
||||
Settings under ``client`` affect all Ceph Clients
|
||||
(e.g., mounted Ceph File Systems, mounted Ceph Block Devices,
|
||||
etc.) as well as Rados Gateway (RGW) daemons.
|
||||
Settings under ``client`` affect all Ceph clients
|
||||
(for example, mounted Ceph File Systems, mounted Ceph Block Devices)
|
||||
as well as RADOS Gateway (RGW) daemons.
|
||||
|
||||
:example: ``objecter_inflight_ops = 512``
|
||||
|
||||
|
||||
Sections may also specify an individual daemon or client name. For example,
|
||||
Configuration sections can also specify an individual daemon or client name. For example,
|
||||
``mon.foo``, ``osd.123``, and ``client.smith`` are all valid section names.
|
||||
|
||||
|
||||
Any given daemon will draw its settings from the global section, the
|
||||
daemon or client type section, and the section sharing its name.
|
||||
Settings in the most-specific section take precedence, so for example
|
||||
if the same option is specified in both :confsec:`global`, :confsec:`mon`, and
|
||||
``mon.foo`` on the same source (i.e., in the same configurationfile),
|
||||
the ``mon.foo`` value will be used.
|
||||
Any given daemon will draw its settings from the global section, the daemon- or
|
||||
client-type section, and the section sharing its name. Settings in the
|
||||
most-specific section take precedence: for example, if the same
|
||||
option is specified in both :confsec:`global`, :confsec:`mon`, and ``mon.foo``
|
||||
on the same source (that is, in the same configuration file), the
|
||||
``mon.foo`` setting will be used.
|
||||
|
||||
If multiple values of the same configuration option are specified in the same
|
||||
section, the last value wins.
|
||||
|
||||
Note that values from the local configuration file always take
|
||||
precedence over values from the monitor configuration database,
|
||||
regardless of which section they appear in.
|
||||
section, the last value specified takes precedence.
|
||||
|
||||
Note that values from the local configuration file always take precedence over
|
||||
values from the monitor configuration database, regardless of the section in
|
||||
which they appear.
|
||||
|
||||
.. _ceph-metavariables:
|
||||
|
||||
Metavariables
|
||||
=============
|
||||
|
||||
Metavariables simplify Ceph Storage Cluster configuration
|
||||
dramatically. When a metavariable is set in a configuration value,
|
||||
Ceph expands the metavariable into a concrete value at the time the
|
||||
configuration value is used. Ceph metavariables are similar to variable expansion in the Bash shell.
|
||||
Metavariables dramatically simplify Ceph storage cluster configuration. When a
|
||||
metavariable is set in a configuration value, Ceph expands the metavariable at
|
||||
the time the configuration value is used. In this way, Ceph metavariables
|
||||
behave similarly to the way that variable expansion works in the Bash shell.
|
||||
|
||||
Ceph supports the following metavariables:
|
||||
Ceph supports the following metavariables:
|
||||
|
||||
.. describe:: $cluster
|
||||
|
||||
@ -204,7 +203,7 @@ Ceph supports the following metavariables:
|
||||
|
||||
.. describe:: $type
|
||||
|
||||
Expands to a daemon or process type (e.g., ``mds``, ``osd``, or ``mon``)
|
||||
Expands to a daemon or process type (for example, ``mds``, ``osd``, or ``mon``)
|
||||
|
||||
:example: ``/var/lib/ceph/$type``
|
||||
|
||||
@ -233,33 +232,32 @@ Ceph supports the following metavariables:
|
||||
:example: ``/var/run/ceph/$cluster-$name-$pid.asok``
|
||||
|
||||
|
||||
|
||||
The Configuration File
|
||||
======================
|
||||
Ceph configuration file
|
||||
=======================
|
||||
|
||||
On startup, Ceph processes search for a configuration file in the
|
||||
following locations:
|
||||
|
||||
#. ``$CEPH_CONF`` (*i.e.,* the path following the ``$CEPH_CONF``
|
||||
#. ``$CEPH_CONF`` (that is, the path following the ``$CEPH_CONF``
|
||||
environment variable)
|
||||
#. ``-c path/path`` (*i.e.,* the ``-c`` command line argument)
|
||||
#. ``-c path/path`` (that is, the ``-c`` command line argument)
|
||||
#. ``/etc/ceph/$cluster.conf``
|
||||
#. ``~/.ceph/$cluster.conf``
|
||||
#. ``./$cluster.conf`` (*i.e.,* in the current working directory)
|
||||
#. ``./$cluster.conf`` (that is, in the current working directory)
|
||||
#. On FreeBSD systems only, ``/usr/local/etc/ceph/$cluster.conf``
|
||||
|
||||
where ``$cluster`` is the cluster's name (default ``ceph``).
|
||||
Here ``$cluster`` is the cluster's name (default: ``ceph``).
|
||||
|
||||
The Ceph configuration file uses an *ini* style syntax. You can add comment
|
||||
text after a pound sign (#) or a semi-colon (;). For example:
|
||||
The Ceph configuration file uses an ``ini`` style syntax. You can add "comment
|
||||
text" after a pound sign (#) or a semi-colon semicolon (;). For example:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
# <--A number (#) sign precedes a comment.
|
||||
; A comment may be anything.
|
||||
# Comments always follow a semi-colon (;) or a pound (#) on each line.
|
||||
# The end of the line terminates a comment.
|
||||
# We recommend that you provide comments in your configuration file(s).
|
||||
# <--A number sign (#) precedes a comment.
|
||||
; A comment may be anything.
|
||||
# Comments always follow a semicolon (;) or a pound sign (#) on each line.
|
||||
# The end of the line terminates a comment.
|
||||
# We recommend that you provide comments in your configuration file(s).
|
||||
|
||||
|
||||
.. _ceph-conf-settings:
|
||||
@ -268,40 +266,41 @@ Config file section names
|
||||
-------------------------
|
||||
|
||||
The configuration file is divided into sections. Each section must begin with a
|
||||
valid configuration section name (see `Configuration sections`_, above)
|
||||
surrounded by square brackets. For example,
|
||||
valid configuration section name (see `Configuration sections`_, above) that is
|
||||
surrounded by square brackets. For example:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
[global]
|
||||
debug_ms = 0
|
||||
|
||||
[osd]
|
||||
debug_ms = 1
|
||||
[global]
|
||||
debug_ms = 0
|
||||
|
||||
[osd.1]
|
||||
debug_ms = 10
|
||||
[osd]
|
||||
debug_ms = 1
|
||||
|
||||
[osd.2]
|
||||
debug_ms = 10
|
||||
[osd.1]
|
||||
debug_ms = 10
|
||||
|
||||
[osd.2]
|
||||
debug_ms = 10
|
||||
|
||||
Config file option values
|
||||
-------------------------
|
||||
|
||||
The value of a configuration option is a string. If it is too long to
|
||||
fit in a single line, you can put a backslash (``\``) at the end of line
|
||||
as the line continuation marker, so the value of the option will be
|
||||
the string after ``=`` in current line combined with the string in the next
|
||||
line::
|
||||
The value of a configuration option is a string. If the string is too long to
|
||||
fit on a single line, you can put a backslash (``\``) at the end of the line
|
||||
and the backslash will act as a line continuation marker. In such a case, the
|
||||
value of the option will be the string after ``=`` in the current line,
|
||||
combined with the string in the next line. Here is an example::
|
||||
|
||||
[global]
|
||||
foo = long long ago\
|
||||
long ago
|
||||
|
||||
In the example above, the value of "``foo``" would be "``long long ago long ago``".
|
||||
In this example, the value of the "``foo``" option is "``long long ago long
|
||||
ago``".
|
||||
|
||||
Normally, the option value ends with a new line, or a comment, like
|
||||
An option value typically ends with either a newline or a comment. For
|
||||
example:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
@ -309,100 +308,108 @@ Normally, the option value ends with a new line, or a comment, like
|
||||
obscure_one = difficult to explain # I will try harder in next release
|
||||
simpler_one = nothing to explain
|
||||
|
||||
In the example above, the value of "``obscure one``" would be "``difficult to explain``";
|
||||
and the value of "``simpler one`` would be "``nothing to explain``".
|
||||
In this example, the value of the "``obscure one``" option is "``difficult to
|
||||
explain``" and the value of the "``simpler one`` options is "``nothing to
|
||||
explain``".
|
||||
|
||||
If an option value contains spaces, and we want to make it explicit, we
|
||||
could quote the value using single or double quotes, like
|
||||
When an option value contains spaces, it can be enclosed within single quotes
|
||||
or double quotes in order to make its scope clear and in order to make sure
|
||||
that the first space in the value is not interpreted as the end of the value.
|
||||
For example:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
[global]
|
||||
line = "to be, or not to be"
|
||||
|
||||
Certain characters are not allowed to be present in the option values directly.
|
||||
They are ``=``, ``#``, ``;`` and ``[``. If we have to, we need to escape them,
|
||||
like
|
||||
In option values, there are four characters that are treated as special
characters: ``=``, ``#``, ``;`` and ``[``. They are permitted to occur in an
|
||||
option value only if they are immediately preceded by the backslash character
|
||||
(``\``). For example:
|
||||
|
||||
.. code-block:: ini
|
||||
|
||||
[global]
|
||||
secret = "i love \# and \["
|
||||
|
||||
Every configuration option is typed with one of the types below:
|
||||
Each configuration option falls under one of the following types:
|
||||
|
||||
.. describe:: int
|
||||
|
||||
64-bit signed integer, Some SI prefixes are supported, like "K", "M", "G",
|
||||
"T", "P", "E", meaning, respectively, 10\ :sup:`3`, 10\ :sup:`6`,
|
||||
10\ :sup:`9`, etc. And "B" is the only supported unit. So, "1K", "1M", "128B" and "-1" are all valid
|
||||
option values. Some times, a negative value implies "unlimited" when it comes to
|
||||
an option for threshold or limit.
|
||||
64-bit signed integer. Some SI suffixes are supported, such as "K", "M",
|
||||
"G", "T", "P", and "E" (meaning, respectively, 10\ :sup:`3`, 10\ :sup:`6`,
|
||||
10\ :sup:`9`, etc.). "B" is the only supported unit string. Thus "1K", "1M",
|
||||
"128B" and "-1" are all valid option values. When a negative value is
|
||||
assigned to a threshold option, this can indicate that the option is
|
||||
"unlimited" -- that is, that there is no threshold or limit in effect.
|
||||
|
||||
:example: ``42``, ``-1``
|
||||
|
||||
.. describe:: uint
|
||||
|
||||
It is almost identical to ``integer``. But a negative value will be rejected.
|
||||
This differs from ``integer`` only in that negative values are not
|
||||
permitted.
|
||||
|
||||
:example: ``256``, ``0``
|
||||
|
||||
.. describe:: str
|
||||
|
||||
Free style strings encoded in UTF-8, but some characters are not allowed. Please
|
||||
reference the above notes for the details.
|
||||
A string encoded in UTF-8. Certain characters are not permitted. Reference
|
||||
the above notes for the details.
|
||||
|
||||
:example: ``"hello world"``, ``"i love \#"``, ``yet-another-name``
|
||||
|
||||
.. describe:: boolean
|
||||
|
||||
one of the two values ``true`` or ``false``. But an integer is also accepted,
|
||||
where "0" implies ``false``, and any non-zero values imply ``true``.
|
||||
Typically either of the two values ``true`` or ``false``. However, any
|
||||
integer is permitted: "0" implies ``false``, and any non-zero value implies
|
||||
``true``.
|
||||
|
||||
:example: ``true``, ``false``, ``1``, ``0``
|
||||
|
||||
.. describe:: addr
|
||||
|
||||
a single address optionally prefixed with ``v1``, ``v2`` or ``any`` for the messenger
|
||||
protocol. If the prefix is not specified, ``v2`` protocol is used. Please see
|
||||
:ref:`address_formats` for more details.
|
||||
A single address, optionally prefixed with ``v1``, ``v2`` or ``any`` for the
|
||||
messenger protocol. If no prefix is specified, the ``v2`` protocol is used.
|
||||
For more details, see :ref:`address_formats`.
|
||||
|
||||
:example: ``v1:1.2.3.4:567``, ``v2:1.2.3.4:567``, ``1.2.3.4:567``, ``2409:8a1e:8fb6:aa20:1260:4bff:fe92:18f5::567``, ``[::1]:6789``
|
||||
|
||||
.. describe:: addrvec
|
||||
|
||||
a set of addresses separated by ",". The addresses can be optionally quoted with ``[`` and ``]``.
|
||||
A set of addresses separated by ",". The addresses can be optionally quoted
|
||||
with ``[`` and ``]``.
|
||||
|
||||
:example: ``[v1:1.2.3.4:567,v2:1.2.3.4:568]``, ``v1:1.2.3.4:567,v1:1.2.3.14:567`` ``[2409:8a1e:8fb6:aa20:1260:4bff:fe92:18f5::567], [2409:8a1e:8fb6:aa20:1260:4bff:fe92:18f5::568]``
|
||||
|
||||
.. describe:: uuid
|
||||
|
||||
the string format of a uuid defined by `RFC4122 <https://www.ietf.org/rfc/rfc4122.txt>`_.
|
||||
And some variants are also supported, for more details, see
|
||||
`Boost document <https://www.boost.org/doc/libs/1_74_0/libs/uuid/doc/uuid.html#String%20Generator>`_.
|
||||
The string format of a uuid defined by `RFC4122
|
||||
<https://www.ietf.org/rfc/rfc4122.txt>`_. Certain variants are also
|
||||
supported: for more details, see `Boost document
|
||||
<https://www.boost.org/doc/libs/1_74_0/libs/uuid/doc/uuid.html#String%20Generator>`_.
|
||||
|
||||
:example: ``f81d4fae-7dec-11d0-a765-00a0c91e6bf6``
|
||||
|
||||
.. describe:: size
|
||||
|
||||
denotes a 64-bit unsigned integer. Both SI prefixes and IEC prefixes are
|
||||
supported. And "B" is the only supported unit. A negative value will be
|
||||
rejected.
|
||||
64-bit unsigned integer. Both SI prefixes and IEC prefixes are supported.
|
||||
"B" is the only supported unit string. Negative values are not permitted.
|
||||
|
||||
:example: ``1Ki``, ``1K``, ``1KiB`` and ``1B``.
|
||||
|
||||
.. describe:: secs
|
||||
|
||||
denotes a duration of time. By default the unit is second if not specified.
|
||||
Following units of time are supported:
|
||||
Denotes a duration of time. The default unit of time is the second.
|
||||
The following units of time are supported:
|
||||
|
||||
* second: "s", "sec", "second", "seconds"
|
||||
* minute: "m", "min", "minute", "minutes"
|
||||
* hour: "hs", "hr", "hour", "hours"
|
||||
* day: "d", "day", "days"
|
||||
* week: "w", "wk", "week", "weeks"
|
||||
* month: "mo", "month", "months"
|
||||
* year: "y", "yr", "year", "years"
|
||||
* second: ``s``, ``sec``, ``second``, ``seconds``
|
||||
* minute: ``m``, ``min``, ``minute``, ``minutes``
|
||||
* hour: ``hs``, ``hr``, ``hour``, ``hours``
|
||||
* day: ``d``, ``day``, ``days``
|
||||
* week: ``w``, ``wk``, ``week``, ``weeks``
|
||||
* month: ``mo``, ``month``, ``months``
|
||||
* year: ``y``, ``yr``, ``year``, ``years``
|
||||
|
||||
:example: ``1 m``, ``1m`` and ``1 week``
|
||||
|
||||
@ -411,112 +418,103 @@ Every configuration option is typed with one of the types below:
Monitor configuration database
==============================

The monitor cluster manages a database of configuration options that can be
consumed by the entire cluster. This allows for streamlined central
configuration management of the entire system. For ease of administration and
transparency, the vast majority of configuration options can and should be
stored in this database.

Some settings might need to be stored in local configuration files because they
affect the ability of the process to connect to the monitors, to authenticate,
and to fetch configuration information. In most cases this applies only to the
``mon_host`` option. This issue can be avoided by using :ref:`DNS SRV
records<mon-dns-lookup>`.

Sections and masks
------------------

Configuration options stored by the monitor can be stored in a global section,
in a daemon-type section, or in a specific daemon section. In this, they are
no different from the options in a configuration file.

In addition, options may have a *mask* associated with them to further restrict
which daemons or clients the option applies to. Masks take two forms:

#. ``type:location`` where ``type`` is a CRUSH property like ``rack`` or
   ``host``, and ``location`` is a value for that property. For example,
   ``host:foo`` would limit the option only to daemons or clients
   running on a particular host.
#. ``class:device-class`` where ``device-class`` is the name of a CRUSH
   device class (for example, ``hdd`` or ``ssd``). For example,
   ``class:ssd`` would limit the option only to OSDs backed by SSDs.
   (This mask has no effect on non-OSD daemons or clients.)

In commands that specify a configuration option, the argument of the option (in
the following examples, this is the "who" string) may be a section name, a
mask, or a combination of both separated by a slash character (``/``). For
example, ``osd/rack:foo`` would refer to all OSD daemons in the ``foo`` rack.

When configuration options are shown, the section name and mask are presented
in separate fields or columns to make them more readable.
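For example, a mask can be combined with a section name when setting an option
centrally. The following sketch uses the ``osd_max_backfills`` and
``debug_osd`` options and arbitrary values purely to illustrate the
``section/mask`` syntax:

.. prompt:: bash $

   ceph config set osd/class:ssd osd_max_backfills 3
   ceph config set osd/host:foo debug_osd 10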
Commands
--------

The following CLI commands are used to configure the cluster:

* ``ceph config dump`` dumps the entire monitor configuration
  database for the cluster.

* ``ceph config get <who>`` dumps the configuration options stored in
  the monitor configuration database for a specific daemon or client
  (for example, ``mds.a``).

* ``ceph config get <who> <option>`` shows either a configuration value
  stored in the monitor configuration database for a specific daemon or client
  (for example, ``mds.a``), or, if that value is not present in the monitor
  configuration database, the compiled-in default value.

* ``ceph config set <who> <option> <value>`` sets a configuration
  option in the monitor configuration database.

* ``ceph config show <who>`` shows the configuration for a running daemon.
  These settings might differ from those stored by the monitors if there are
  also local configuration files in use or if options have been overridden on
  the command line or at run time. The source of the values of the options is
  displayed in the output.

* ``ceph config assimilate-conf -i <input file> -o <output file>`` ingests a
  configuration file from *input file* and moves any valid options into the
  monitor configuration database. Any settings that are unrecognized, are
  invalid, or cannot be controlled by the monitor will be returned in an
  abbreviated configuration file stored in *output file*. This command is
  useful for transitioning from legacy configuration files to centralized
  monitor-based configuration.

Note that ``ceph config set <who> <option> <value>`` and ``ceph config get
<who> <option>`` will not necessarily return the same values. The latter
command will show compiled-in default values. To determine whether a
configuration option is present in the monitor configuration database, run
``ceph config dump``.
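To see this asymmetry in practice, a quick sketch (``osd.0`` and the option
name are arbitrary choices):

.. prompt:: bash $

   ceph config get osd.0 osd_max_backfills     # prints a value even if only the compiled-in default exists
   ceph config dump | grep osd_max_backfills   # prints nothing unless a value is stored in the database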
Help
====

To get help for a particular option, run the following command:

.. prompt:: bash $

   ceph config help <option>

For example:

.. prompt:: bash $

   ceph config help log_file

::

  log_file - path to log file
    (std::string, basic)
    Default (non-daemon):
    Default (daemon): /var/log/ceph/$cluster-$name.log
@ -553,20 +551,29 @@ or:
  "can_update_at_runtime": false
  }

The ``level`` property can be ``basic``, ``advanced``, or ``dev``. The ``dev``
options are intended for use by developers, generally for testing purposes, and
are not recommended for use by operators.

.. note:: This command uses the configuration schema that is compiled into the
   running monitors. If you have a mixed-version cluster (as might exist, for
   example, during an upgrade), you might want to query the option schema from
   a specific running daemon by running a command of the following form:

   .. prompt:: bash $

      ceph daemon <name> config help [option]
Runtime Changes
===============

In most cases, Ceph permits changes to the configuration of a daemon at
run time. This can be used for increasing or decreasing the amount of logging
output, for enabling or disabling debug settings, and for runtime optimization.

Use the ``ceph config set`` command to update configuration options. For
example, to enable the most verbose debug log level on a specific OSD, run a
command of the following form:

.. prompt:: bash $
@ -575,129 +582,133 @@ example, to enable the debug log level on a specific OSD, run a command of this
.. note:: If an option has been customized in a local configuration file, the
   `central config
   <https://ceph.io/en/news/blog/2018/new-mimic-centralized-configuration-management/>`_
   setting will be ignored because it has a lower priority than the local
   configuration file.

.. note:: Log levels range from 0 to 20.

Override values
---------------

Options can be set temporarily by using the ``tell`` or ``daemon`` interfaces
of the Ceph CLI. These *override* values are ephemeral, which means that they
affect only the current instance of the daemon and revert to persistently
configured values when the daemon restarts.

Override values can be set in two ways:

#. From any host, send a message to a daemon with a command of the following
   form:

   .. prompt:: bash $

      ceph tell <name> config set <option> <value>

   For example:

   .. prompt:: bash $

      ceph tell osd.123 config set debug_osd 20

   The ``tell`` command can also accept a wildcard as the daemon identifier.
   For example, to adjust the debug level on all OSD daemons, run a command of
   the following form:

   .. prompt:: bash $

      ceph tell osd.* config set debug_osd 20

#. On the host where the daemon is running, connect to the daemon via a socket
   in ``/var/run/ceph`` by running a command of the following form:

   .. prompt:: bash $

      ceph daemon <name> config set <option> <value>

   For example:

   .. prompt:: bash $

      ceph daemon osd.4 config set debug_osd 20

.. note:: In the output of the ``ceph config show`` command, these temporary
   values are shown to have a source of ``override``.
Viewing runtime settings
========================

You can see the current settings specified for a running daemon with the ``ceph
config show`` command. For example, to see the (non-default) settings for the
daemon ``osd.0``, run the following command:

.. prompt:: bash $

   ceph config show osd.0

To see a specific setting, run the following command:

.. prompt:: bash $

   ceph config show osd.0 debug_osd

To see all settings (including those with default values), run the following
command:

.. prompt:: bash $

   ceph config show-with-defaults osd.0

You can see all settings for a daemon that is currently running by connecting
to it on the local host via the admin socket. For example, to dump all
current settings, run the following command:

.. prompt:: bash $

   ceph daemon osd.0 config show

To see non-default settings and to see where each value came from (for example,
a config file, the monitor, or an override), run the following command:

.. prompt:: bash $

   ceph daemon osd.0 config diff

To see the value of a single setting, run the following command:

.. prompt:: bash $

   ceph daemon osd.0 config get debug_osd
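For example, to spot-check a single option both centrally and via the admin
socket on a hypothetical ``osd.0`` (both commands are the general forms shown
above, with example arguments filled in):

.. prompt:: bash $

   ceph config get osd.0 debug_osd
   ceph daemon osd.0 config get debug_osd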
Changes introduced in Octopus
=============================

The Octopus release changed the way the configuration file is parsed.
These changes are as follows:

- Repeated configuration options are allowed, and no warnings will be
  displayed. This means that the setting that comes last in the file is the one
  that takes effect. Prior to this change, Ceph displayed warning messages when
  lines containing duplicate options were encountered, such as::

    warning line 42: 'foo' in section 'bar' redefined

- Prior to Octopus, options containing invalid UTF-8 characters were ignored
  with warning messages. But in Octopus, they are treated as fatal errors.

- The backslash character ``\`` is used as the line-continuation marker that
  combines the next line with the current one. Prior to Octopus, there was a
  requirement that any end-of-line backslash be followed by a non-empty line.
  But in Octopus, an empty line following a backslash is allowed.

- In the configuration file, each line specifies an individual configuration
  option. The option's name and its value are separated with ``=``, and the
  value may be enclosed within single or double quotes. If an invalid
  configuration is specified, it will be treated as an invalid configuration
  file::

    bad option ==== bad value

- Prior to Octopus, if no section name was specified in the configuration file,
  all options would be set as though they were within the :confsec:`global`
  section. This approach is discouraged. Since Octopus, any configuration
  file that has no section name must contain only a single option.
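As an illustration of these parsing rules, here is a small, hypothetical
``ceph.conf`` fragment that uses quoting and line continuation (the values are
placeholders)::

    [global]
    # both '#' and ';' start comment lines
    mon_host = "v2:10.0.0.1:3300,v1:10.0.0.1:6789"
    ; the trailing backslash joins this line with the next one
    osd_pool_default_size = \
    3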
.. |---| unicode:: U+2014 .. EM DASH :trim:

@ -1,3 +1,5 @@
.. _mon-dns-lookup:

===============================
Looking up Monitors through DNS
===============================
@ -2,49 +2,51 @@
Adding/Removing OSDs
====================

When a cluster is up and running, it is possible to add or remove OSDs.

Adding OSDs
===========

OSDs can be added to a cluster in order to expand the cluster's capacity and
resilience. Typically, an OSD is a Ceph ``ceph-osd`` daemon running on one
storage drive within a host machine. But if your host machine has multiple
storage drives, you may map one ``ceph-osd`` daemon for each drive on the
machine.

It's a good idea to check the capacity of your cluster so that you know when it
approaches its capacity limits. If your cluster has reached its ``near full``
ratio, then you should add OSDs to expand your cluster's capacity.

.. warning:: Do not add an OSD after your cluster has reached its ``full
   ratio``. OSD failures that occur after the cluster reaches its ``near full
   ratio`` might cause the cluster to exceed its ``full ratio``.

Deploying your Hardware
-----------------------

If you are also adding a new host when adding a new OSD, see `Hardware
Recommendations`_ for details on minimum recommendations for OSD hardware. To
add an OSD host to your cluster, begin by making sure that an appropriate
version of Linux has been installed on the host machine and that all initial
preparations for your storage drives have been carried out. For details, see
`Filesystem Recommendations`_.

Next, add your OSD host to a rack in your cluster, connect the host to the
network, and ensure that the host has network connectivity. For details, see
`Network Configuration Reference`_.

.. _Hardware Recommendations: ../../../start/hardware-recommendations
.. _Filesystem Recommendations: ../../configuration/filesystem-recommendations
.. _Network Configuration Reference: ../../configuration/network-config-ref

Installing the Required Software
--------------------------------

If your cluster has been manually deployed, you will need to install Ceph
software packages manually. For details, see `Installing Ceph (Manual)`_.
Configure SSH for the appropriate user to have both passwordless authentication
and root permissions.

.. _Installing Ceph (Manual): ../../../install
@ -53,48 +55,56 @@ and root permissions.
Adding an OSD (Manual)
----------------------

The following procedure sets up a ``ceph-osd`` daemon, configures this OSD to
use one drive, and configures the cluster to distribute data to the OSD. If
your host machine has multiple drives, you may add an OSD for each drive on the
host by repeating this procedure.

As the following procedure will demonstrate, adding an OSD involves creating a
metadata directory for it, configuring a data storage drive, adding the OSD to
the cluster, and then adding it to the CRUSH map.

When you add the OSD to the CRUSH map, you will need to consider the weight you
assign to the new OSD. Since storage drive capacities increase over time, newer
OSD hosts are likely to have larger hard drives than the older hosts in the
cluster have and therefore might have greater weight as well.

.. tip:: Ceph works best with uniform hardware across pools. It is possible to
   add drives of dissimilar size and then adjust their weights accordingly.
   However, for best performance, consider a CRUSH hierarchy that has drives of
   the same type and size. It is better to add larger drives uniformly to
   existing hosts. This can be done incrementally, replacing smaller drives
   each time the new drives are added.

#. Create the new OSD by running a command of the following form. If you opt
   not to specify a UUID in this command, the UUID will be set automatically
   when the OSD starts up. The OSD number, which is needed for subsequent
   steps, is found in the command's output:

   .. prompt:: bash $

      ceph osd create [{uuid} [{id}]]

   If the optional parameter {id} is specified it will be used as the OSD ID.
   However, if the ID number is already in use, the command will fail.

   .. warning:: Explicitly specifying the ``{id}`` parameter is not
      recommended. IDs are allocated as an array, and any skipping of entries
      consumes extra memory. This memory consumption can become significant if
      there are large gaps or if clusters are large. By leaving the ``{id}``
      parameter unspecified, we ensure that Ceph uses the smallest ID number
      available and that these problems are avoided.

#. Create the default directory for your new OSD by running commands of the
   following form:

   .. prompt:: bash $

      ssh {new-osd-host}
      sudo mkdir /var/lib/ceph/osd/ceph-{osd-number}

#. If the OSD will be created on a drive other than the OS drive, prepare it
   for use with Ceph. Run commands of the following form:

   .. prompt:: bash $
@ -102,41 +112,49 @@ weight).
      sudo mkfs -t {fstype} /dev/{drive}
      sudo mount -o user_xattr /dev/{hdd} /var/lib/ceph/osd/ceph-{osd-number}

#. Initialize the OSD data directory by running commands of the following form:

   .. prompt:: bash $

      ssh {new-osd-host}
      ceph-osd -i {osd-num} --mkfs --mkkey

   Make sure that the directory is empty before running ``ceph-osd``.

#. Register the OSD authentication key by running a command of the following
   form:

   .. prompt:: bash $

      ceph auth add osd.{osd-num} osd 'allow *' mon 'allow rwx' -i /var/lib/ceph/osd/ceph-{osd-num}/keyring

   This presentation of the command has ``ceph-{osd-num}`` in the listed path
   because many clusters have the name ``ceph``. However, if your cluster name
   is not ``ceph``, then the string ``ceph`` in ``ceph-{osd-num}`` needs to be
   replaced with your cluster name. For example, if your cluster name is
   ``cluster1``, then the path in the command should be
   ``/var/lib/ceph/osd/cluster1-{osd-num}/keyring``.

#. Add the OSD to the CRUSH map by running the following command. This allows
   the OSD to begin receiving data. The ``ceph osd crush add`` command can add
   OSDs to the CRUSH hierarchy wherever you want. If you specify one or more
   buckets, the command places the OSD in the most specific of those buckets,
   and it moves that bucket underneath any other buckets that you have
   specified. **Important:** If you specify only the root bucket, the command
   will attach the OSD directly to the root, but CRUSH rules expect OSDs to be
   inside of hosts. If the OSDs are not inside hosts, the OSDs will likely not
   receive any data.

   .. prompt:: bash $

      ceph osd crush add {id-or-name} {weight} [{bucket-type}={bucket-name} ...]

   Note that there is another way to add a new OSD to the CRUSH map: decompile
   the CRUSH map, add the OSD to the device list, add the host as a bucket (if
   it is not already in the CRUSH map), add the device as an item in the host,
   assign the device a weight, recompile the CRUSH map, and set the CRUSH map.
   For details, see `Add/Move an OSD`_. This is rarely necessary with recent
   releases (this sentence was written the month that Reef was released).
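For example, to add a hypothetical ``osd.123`` with a weight of ``1.0`` under
host ``node1`` in rack ``rack1`` under the ``default`` root (all of these names
are placeholders):

.. prompt:: bash $

   ceph osd crush add osd.123 1.0 root=default rack=rack1 host=node1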
.. _rados-replacing-an-osd:
@ -144,193 +162,206 @@ weight).
Replacing an OSD
----------------

.. note:: If the procedure in this section does not work for you, try the
   instructions in the ``cephadm`` documentation:
   :ref:`cephadm-replacing-an-osd`.

Sometimes OSDs need to be replaced: for example, when a disk fails, or when an
administrator wants to reprovision OSDs with a new back end (perhaps when
switching from Filestore to BlueStore). Replacing an OSD differs from `Removing
the OSD`_ in that the replaced OSD's ID and CRUSH map entry must be kept intact
after the OSD is destroyed for replacement.

#. Make sure that it is safe to destroy the OSD:

   .. prompt:: bash $

      while ! ceph osd safe-to-destroy osd.{id} ; do sleep 10 ; done

#. Destroy the OSD:

   .. prompt:: bash $

      ceph osd destroy {id} --yes-i-really-mean-it

#. *Optional*: If the disk that you plan to use is not a new disk and has been
   used before for other purposes, zap the disk:

   .. prompt:: bash $

      ceph-volume lvm zap /dev/sdX

#. Prepare the disk for replacement by using the ID of the OSD that was
   destroyed in previous steps:

   .. prompt:: bash $

      ceph-volume lvm prepare --osd-id {id} --data /dev/sdX

#. Finally, activate the OSD:

   .. prompt:: bash $

      ceph-volume lvm activate {id} {fsid}

Alternatively, instead of carrying out the final two steps (preparing the disk
and activating the OSD), you can re-create the OSD by running a single command
of the following form:

.. prompt:: bash $

   ceph-volume lvm create --osd-id {id} --data /dev/sdX
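For instance, replacing a hypothetical failed ``osd.7`` whose new device is
``/dev/sdf`` (both the ID and the device name are placeholders) might look like
this:

.. prompt:: bash $

   while ! ceph osd safe-to-destroy osd.7 ; do sleep 10 ; done
   ceph osd destroy 7 --yes-i-really-mean-it
   ceph-volume lvm zap /dev/sdf
   ceph-volume lvm create --osd-id 7 --data /dev/sdf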
Starting the OSD
----------------

After an OSD is added to Ceph, the OSD is in the cluster. However, until it is
started, the OSD is considered ``down`` and ``in``. The OSD is not running and
will be unable to receive data. To start an OSD, either run ``service ceph``
from your admin host or run a command of the following form to start the OSD
from its host machine:

.. prompt:: bash $

   sudo systemctl start ceph-osd@{osd-num}

After the OSD is started, it is considered ``up`` and ``in``.
Observing the Data Migration
----------------------------

After the new OSD has been added to the CRUSH map, Ceph begins rebalancing the
cluster by migrating placement groups (PGs) to the new OSD. To observe this
process by using the `ceph`_ tool, run the following command:

.. prompt:: bash $

   ceph -w

Or:

.. prompt:: bash $

   watch ceph status

The PG states will first change from ``active+clean`` to ``active, some
degraded objects`` and then return to ``active+clean`` when migration
completes. When you are finished observing, press Ctrl-C to exit.

.. _Add/Move an OSD: ../crush-map#addosd
.. _ceph: ../monitoring
Removing OSDs (Manual)
======================

It is possible to remove an OSD manually while the cluster is running: you
might want to do this in order to reduce the size of the cluster or when
replacing hardware. Typically, an OSD is a Ceph ``ceph-osd`` daemon running on
one storage drive within a host machine. Alternatively, if your host machine
has multiple storage drives, you might need to remove multiple ``ceph-osd``
daemons: one daemon for each drive on the machine.

.. warning:: Before you begin the process of removing an OSD, make sure that
   your cluster is not near its ``full ratio``. Otherwise the act of removing
   OSDs might cause the cluster to reach or exceed its ``full ratio``.
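Before taking anything out, it can be helpful to check how much headroom the
cluster has; for example, with the standard status commands (shown here only as
a suggestion):

.. prompt:: bash $

   ceph df
   ceph osd df tree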
Taking the OSD ``out`` of the Cluster
-------------------------------------

OSDs are typically ``up`` and ``in`` before they are removed from the cluster.
Before the OSD can be removed from the cluster, the OSD must be taken ``out``
of the cluster so that Ceph can begin rebalancing and copying its data to other
OSDs. To take an OSD ``out`` of the cluster, run a command of the following
form:

.. prompt:: bash $

   ceph osd out {osd-num}
Observing the Data Migration
----------------------------

After the OSD has been taken ``out`` of the cluster, Ceph begins rebalancing
the cluster by migrating placement groups out of the OSD that was removed. To
observe this process by using the `ceph`_ tool, run the following command:

.. prompt:: bash $

   ceph -w

The PG states will change from ``active+clean`` to ``active, some degraded
objects`` and will then return to ``active+clean`` when migration completes.
When you are finished observing, press Ctrl-C to exit.

.. note:: Under certain conditions, the action of taking ``out`` an OSD
   might lead CRUSH to encounter a corner case in which some PGs remain stuck
   in the ``active+remapped`` state. This problem sometimes occurs in small
   clusters with few hosts (for example, in a small testing cluster). To
   address this problem, mark the OSD ``in`` by running a command of the
   following form:

   .. prompt:: bash $

      ceph osd in {osd-num}

   After the OSD has come back to its initial state, do not mark the OSD
   ``out`` again. Instead, set the OSD's weight to ``0`` by running a command
   of the following form:

   .. prompt:: bash $

      ceph osd crush reweight osd.{osd-num} 0

   After the OSD has been reweighted, observe the data migration and confirm
   that it has completed successfully. The difference between marking an OSD
   ``out`` and reweighting the OSD to ``0`` has to do with the bucket that
   contains the OSD. When an OSD is marked ``out``, the weight of the bucket is
   not changed. But when an OSD is reweighted to ``0``, the weight of the
   bucket is updated (namely, the weight of the OSD is subtracted from the
   overall weight of the bucket). When operating small clusters, it can
   sometimes be preferable to use the above reweight command.
Stopping the OSD
----------------

After you take an OSD ``out`` of the cluster, the OSD might still be running.
In such a case, the OSD is ``up`` and ``out``. Before it is removed from the
cluster, the OSD must be stopped by running commands of the following form:

.. prompt:: bash $

   ssh {osd-host}
   sudo systemctl stop ceph-osd@{osd-num}

After the OSD has been stopped, it is ``down``.
Removing the OSD
----------------

The following procedure removes an OSD from the cluster map, removes the OSD's
authentication key, removes the OSD from the OSD map, and removes the OSD from
the ``ceph.conf`` file. If your host has multiple drives, it might be necessary
to remove an OSD for each drive by repeating this procedure.

#. Begin by having the cluster forget the OSD. This step removes the OSD from
   the CRUSH map, removes the OSD's authentication key, and removes the OSD
   from the OSD map. (The :ref:`purge subcommand <ceph-admin-osd>` was
   introduced in Luminous. For older releases, see :ref:`the procedure linked
   here <ceph_osd_purge_procedure_pre_luminous>`.):

   .. prompt:: bash $

      ceph osd purge {id} --yes-i-really-mean-it

#. Navigate to the host where the master copy of the cluster's
   ``ceph.conf`` file is kept:

   .. prompt:: bash $
@ -338,46 +369,48 @@ OSD for each drive by repeating this procedure.
      cd /etc/ceph
      vim ceph.conf

#. Remove the OSD entry from your ``ceph.conf`` file (if such an entry
   exists)::

     [osd.1]
     host = {hostname}

#. Copy the updated ``ceph.conf`` file from the location on the host where the
   master copy of the cluster's ``ceph.conf`` is kept to the ``/etc/ceph``
   directory of the other hosts in your cluster.

.. _ceph_osd_purge_procedure_pre_luminous:

If your Ceph cluster is older than Luminous, you will be unable to use the
``ceph osd purge`` command. Instead, carry out the following procedure:

#. Remove the OSD from the CRUSH map so that it no longer receives data (for
   more details, see `Remove an OSD`_):

   .. prompt:: bash $

      ceph osd crush remove {name}

   Instead of removing the OSD from the CRUSH map, you might opt for one of two
   alternatives: (1) decompile the CRUSH map, remove the OSD from the device
   list, and remove the device from the host bucket; (2) remove the host bucket
   from the CRUSH map (provided that it is in the CRUSH map and that you intend
   to remove the host), recompile the map, and set it.

#. Remove the OSD authentication key:

   .. prompt:: bash $

      ceph auth del osd.{osd-num}

#. Remove the OSD:

   .. prompt:: bash $

      ceph osd rm {osd-num}

   For example:

   .. prompt:: bash $
@ -1,13 +1,12 @@
===============
Cache Tiering
===============

.. warning:: Cache tiering has been deprecated in the Reef release as it
   has lacked a maintainer for a very long time. This does not mean
   it will be certainly removed, but we may choose to remove it
   without much further notice.

A cache tier provides Ceph Clients with better I/O performance for a subset of
the data stored in a backing storage tier. Cache tiering involves creating a
pool of relatively fast/expensive storage devices (e.g., solid state drives)
@ -382,26 +382,17 @@ items within the host buckets::
   ``rjenkins1`` algorithm. To select ``rjenkins1`` as the hash algorithm,
   enter ``0`` as your hash setting.

.. _weightingbucketitems:

.. topic:: Weighting Bucket Items

   Ceph expresses bucket weights as doubles, which allows for fine-grained
   weighting. A weight is the relative difference between device capacities.
   We recommend using ``1.00`` as the relative weight for a 1 TB storage
   device. In such a scenario, a weight of ``0.50`` would represent
   approximately 500 GB, and a weight of ``3.00`` would represent approximately
   3 TB. Buckets higher in the CRUSH hierarchy have a weight that is the sum of
   the weight of the leaf items aggregated by the bucket.
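As a concrete illustration of such weights, a host bucket in a decompiled CRUSH
map might look like the following sketch, in which the host and device names
are hypothetical and the host's weight is the sum of its item weights::

    host node1 {
        id -2                      # do not change unnecessarily
        alg straw2
        hash 0                     # rjenkins1
        item osd.0 weight 1.000    # ~1 TB device
        item osd.1 weight 3.000    # ~3 TB device
    }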
.. _crushmaprules:
@ -409,164 +400,160 @@ items within the host buckets::
CRUSH Map Rules
---------------

CRUSH maps have rules that govern data placement for a pool: these are
called "CRUSH rules". The default CRUSH map has one rule for each pool. If you
are running a large cluster, you might create many pools and each of those
pools might have its own non-default CRUSH rule.

.. note:: In most cases, there is no need to modify the default rule. When a
   new pool is created, by default the rule will be set to the value ``0``
   (the numeric ID of the default CRUSH rule).

CRUSH rules define policy that governs how data is distributed across the
devices in the hierarchy. The rules define placement as well as replication
strategies or distribution policies that allow you to specify exactly how CRUSH
places data replicas. For example, you might create one rule selecting a pair
of targets for two-way mirroring, another rule for selecting three targets in
two different data centers for three-way replication, and yet another rule for
erasure coding across six storage devices. For a detailed discussion of CRUSH
rules, see **Section 3.2** of `CRUSH - Controlled, Scalable, Decentralized
Placement of Replicated Data`_.

A rule takes the following form::

    rule <rulename> {

        id [a unique integer ID]
        type [replicated|erasure]
        step take <bucket-name> [class <device-class>]
        step [choose|chooseleaf] [firstn|indep] <N> type <bucket-type>
        step emit
    }


``id``
:Description: A unique integer that identifies the rule.
:Purpose: A component of the rule mask.
:Type: Integer
:Required: Yes
:Default: 0


``type``
:Description: Denotes the type of replication strategy to be enforced by the
              rule.
:Purpose: A component of the rule mask.
:Type: String
:Required: Yes
:Default: ``replicated``
:Valid Values: ``replicated`` or ``erasure``


``step take <bucket-name> [class <device-class>]``
:Description: Takes a bucket name and iterates down the tree. If
              the ``device-class`` argument is specified, the argument must
              match a class assigned to OSDs within the cluster. Only
              devices belonging to the class are included.
:Purpose: A component of the rule.
:Required: Yes
:Example: ``step take data``


``step choose firstn {num} type {bucket-type}``
:Description: Selects ``num`` buckets of the given type from within the
              current bucket. ``{num}`` is usually the number of replicas in
              the pool (in other words, the pool size).

              - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (as many buckets as are available).
              - If ``pool-num-replicas > {num} > 0``, choose that many buckets.
              - If ``{num} < 0``, choose ``pool-num-replicas - {num}`` buckets.

:Purpose: A component of the rule.
:Prerequisite: Follows ``step take`` or ``step choose``.
:Example: ``step choose firstn 1 type row``


``step chooseleaf firstn {num} type {bucket-type}``
:Description: Selects a set of buckets of the given type and chooses a leaf
              node (that is, an OSD) from the subtree of each bucket in that
              set of buckets. The number of buckets in the set is usually the
              number of replicas in the pool (in other words, the pool size).

              - If ``{num} == 0``, choose ``pool-num-replicas`` buckets (as many buckets as are available).
              - If ``pool-num-replicas > {num} > 0``, choose that many buckets.
              - If ``{num} < 0``, choose ``pool-num-replicas - {num}`` buckets.

:Purpose: A component of the rule. Using ``chooseleaf`` obviates the need to
          select a device in a separate step.
:Prerequisite: Follows ``step take`` or ``step choose``.
:Example: ``step chooseleaf firstn 0 type row``


``step emit``
:Description: Outputs the current value on the top of the stack and empties
              the stack. Typically used at the end of a rule, but may also be
              used to choose from different trees in the same rule.
:Purpose: A component of the rule.
:Prerequisite: Follows ``step choose``.
:Example: ``step emit``
|
||||
.. important:: A single CRUSH rule can be assigned to multiple pools, but
|
||||
a single pool cannot have multiple CRUSH rules.
|
||||
|
||||
.. important:: A given CRUSH rule may be assigned to multiple pools, but it
|
||||
is not possible for a single pool to have multiple CRUSH rules.
|
||||
``firstn`` or ``indep``
:Description: Determines which replacement strategy CRUSH uses when items (OSDs)
              are marked ``down`` in the CRUSH map. When this rule is used
              with replicated pools, ``firstn`` is used. When this rule is
              used with erasure-coded pools, ``indep`` is used.

              The difference has to do with how the two modes behave when a
              previously selected device fails. Suppose that a PG is stored on
              OSDs 1, 2, 3, 4, and 5, and then OSD 3 goes down.

              When in ``firstn`` mode, CRUSH simply adjusts its calculation
              to select OSDs 1 and 2, then selects 3 and discovers that 3 is
              down, retries and selects 4 and 5, and finally goes on to
              select a new OSD: OSD 6. The final CRUSH mapping
              transformation is therefore 1, 2, 3, 4, 5 → 1, 2, 4, 5, 6.

              However, for an erasure-coded pool, that sequence would have
              changed the data mapped to OSDs 4, 5, and 6. The ``indep`` mode
              attempts to avoid this unwanted consequence. When in ``indep``
              mode, CRUSH can be expected to select 3, discover that 3 is
              down, retry, and select 6. The final CRUSH mapping
              transformation is therefore 1, 2, 3, 4, 5 → 1, 2, 6, 4, 5.

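To make the distinction concrete, here is a minimal sketch of two rules that differ only in the replacement mode. The rule names, IDs, root, and ``host`` failure domain are illustrative and not taken from the text above::

    # replicated pools typically use firstn
    rule replicated_hdd {
        id 1
        type replicated
        step take default class hdd
        step chooseleaf firstn 0 type host
        step emit
    }

    # erasure-coded pools typically use indep
    rule ec_hdd {
        id 2
        type erasure
        step take default class hdd
        step chooseleaf indep 0 type host
        step emit
    }
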
.. _crush-reclassify:

Migrating from a legacy SSD rule to device classes
--------------------------------------------------

Prior to the Luminous release's introduction of the *device class* feature, in
order to write rules that applied to a specialized device type (for example,
SSD), it was necessary to manually edit the CRUSH map and maintain a parallel
hierarchy for each device type. The device class feature provides a more
transparent way to achieve this end.

However, if your cluster is migrated from an existing manually customized
per-device map to new device class-based rules in the trivial way, all data in
the system will be reshuffled.

The ``crushtool`` utility has several commands that can transform a legacy rule
and hierarchy and allow you to start using the new device class rules. There
are three possible types of transformation:

#. ``--reclassify-root <root-name> <device-class>``

   This command examines everything under ``root-name`` in the hierarchy and
   rewrites any rules that reference the specified root via ``take <root-name>``
   so that they instead have the form ``take <root-name> class <device-class>``.
   The command also renumbers the buckets in such a way that the old IDs are
   used for the specified class's "shadow tree", and as a result no data
   movement takes place.

   For example, suppose you have the following as an existing rule::

        rule replicated_rule {
                id 0
@ -576,8 +563,8 @@ There are three types of transformations possible:
                step emit
        }

   If the root ``default`` is reclassified as class ``hdd``, the new rule will
   be as follows::

        rule replicated_rule {
                id 0
@ -589,23 +576,26 @@
#. ``--set-subtree-class <bucket-name> <device-class>``

   This command marks every device in the subtree that is rooted at *bucket-name*
   with the specified device class.

   This command is typically used in conjunction with the ``--reclassify-root``
   option in order to ensure that all devices in that root are labeled with the
   correct class. In certain circumstances, however, some of those devices are
   correctly labeled with a different class and must not be relabeled. In such
   cases, one can exclude the ``--set-subtree-class`` option. The remapping
   process will not be perfect, because the previous rule distributed across
   devices of multiple classes while the adjusted rules will map only to devices
   of the specified device class. However, when there are not many outlier
   devices, the resulting level of data movement is often within tolerable
   limits.

#. ``--reclassify-bucket <match-pattern> <device-class> <default-parent>``

   This command allows you to merge a parallel type-specific hierarchy with the
   normal hierarchy. For example, many users have maps that resemble the
   following::

        host node1 {
                id -2           # do not change unnecessarily
@ -627,7 +617,7 @@ There are three types of transformations possible:
                alg straw2
                hash 0  # rjenkins1
                item osd.80 weight 2.000
                ...
        }

        root default {
@ -644,42 +634,53 @@
                alg straw2
                hash 0  # rjenkins1
                item node1-ssd weight 2.000
                ...
        }

   This command reclassifies each bucket that matches a certain pattern. The
   pattern can be of the form ``%suffix`` or ``prefix%``. For example, in the
   above example, we would use the pattern ``%-ssd``. For each matched bucket,
   the remaining portion of the name (corresponding to the ``%`` wildcard)
   specifies the *base bucket*. All devices in the matched bucket are labeled
   with the specified device class and then moved to the base bucket. If the
   base bucket does not exist (for example, ``node12-ssd`` exists but
   ``node12`` does not), then it is created and linked under the specified
   *default parent* bucket. In each case, care is taken to preserve the old
   bucket IDs for the new shadow buckets in order to prevent data movement.
   Any rules with ``take`` steps that reference the old buckets are adjusted
   accordingly.

#. ``--reclassify-bucket <bucket-name> <device-class> <base-bucket>``

   The same command can also be used without a wildcard in order to map a
   single bucket. For example, in the previous example, we want the
   ``ssd`` bucket to be mapped to the ``default`` bucket.

#. The final command to convert the map that consists of the above fragments
   resembles the following:

   .. prompt:: bash $

      ceph osd getcrushmap -o original
      crushtool -i original --reclassify \
        --set-subtree-class default hdd \
        --reclassify-root default hdd \
        --reclassify-bucket %-ssd ssd default \
        --reclassify-bucket ssd ssd default \
        -o adjusted

``--compare`` flag
------------------

A ``--compare`` flag is available to make sure that the conversion performed in
:ref:`Migrating from a legacy SSD rule to device classes <crush-reclassify>` is
correct. This flag tests a large sample of inputs against the CRUSH map and
checks that the expected result is output. The options that control these
inputs are the same as the options that apply to the ``--test`` command. For an
illustration of how this ``--compare`` command applies to the above example,
see the following:

.. prompt:: bash $

@ -691,57 +692,55 @@ In order to ensure that the conversion is correct, there is a ``--compare`` comm
   rule 1 had 0/10240 mismatched mappings (0)
   maps appear equivalent

If the command finds any differences, the ratio of remapped inputs is reported
in the parentheses.

When you are satisfied with the adjusted map, apply it to the cluster by
running the following command:

.. prompt:: bash $

   ceph osd setcrushmap -i adjusted

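As a quick sanity check after applying the adjusted map (a sketch added here for convenience, not part of the original page), the device classes and their per-class shadow trees can be listed to confirm that the reclassification looks as expected:

.. prompt:: bash $

   ceph osd crush class ls
   ceph osd crush tree --show-shadow
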
Manually Tuning CRUSH
---------------------

If you have verified that all clients are running recent code, you can adjust
the CRUSH tunables by extracting the CRUSH map, modifying the values, and
reinjecting the map into the cluster. The procedure is carried out as follows:

#. Extract the latest CRUSH map:

   .. prompt:: bash $

      ceph osd getcrushmap -o /tmp/crush

#. Adjust tunables. In our tests, the following values appear to result in the
   best behavior for both large and small clusters. The procedure requires that
   you specify the ``--enable-unsafe-tunables`` flag in the ``crushtool``
   command. Use this option with **extreme care**:

   .. prompt:: bash $

      crushtool -i /tmp/crush --set-choose-local-tries 0 --set-choose-local-fallback-tries 0 --set-choose-total-tries 50 -o /tmp/crush.new

#. Reinject the modified map:

   .. prompt:: bash $

      ceph osd setcrushmap -i /tmp/crush.new

Legacy values
-------------

To set the legacy values of the CRUSH tunables, run the following command:

.. prompt:: bash $

   crushtool -i /tmp/crush --set-choose-local-tries 2 --set-choose-local-fallback-tries 5 --set-choose-total-tries 19 --set-chooseleaf-descend-once 0 --set-chooseleaf-vary-r 0 -o /tmp/crush.legacy

The special ``--enable-unsafe-tunables`` flag is required. Be careful when
running old versions of the ``ceph-osd`` daemon after reverting to legacy
values, because the feature bit is not perfectly enforced.

.. _CRUSH - Controlled, Scalable, Decentralized Placement of Replicated Data: https://ceph.io/assets/pdfs/weil-crush-sc06.pdf

@ -221,6 +221,8 @@ To view the contents of the rules, run the following command:

   ceph osd crush rule dump

.. _device_classes:

Device classes
--------------

@ -236,7 +236,7 @@ mode. As a result, however, pools with lost OSDs but without complete data loss
unable to recover and go active without manual intervention to temporarily change
the ``min_size`` setting.

We recommend that ``min_size`` be ``K+2`` or greater to prevent loss of writes and
We recommend that ``min_size`` be ``K+1`` or greater to prevent loss of writes and
loss of data.

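As a concrete illustration of the temporary workaround mentioned above (the pool name and values are placeholders, assuming a k=4, m=2 profile), ``min_size`` can be lowered to allow recovery and then restored afterwards:

.. prompt:: bash $

   ceph osd pool set ecpool min_size 4
   ceph osd pool set ecpool min_size 6
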
@ -222,7 +222,7 @@ What if the state is ``probing``?
single-monitor cluster (never do this in production), the monitor will pass
through the probing state almost instantaneously. In a multi-monitor
cluster, the monitors will stay in this state until they find enough monitors
to form a quorum -- this means that if you have 2 out of 3 monitors down, the
to form a quorum |---| this means that if you have 2 out of 3 monitors down, the
one remaining monitor will stay in this state indefinitely until you bring
one of the other monitors up.

@ -636,3 +636,6 @@ Finally, you should reach out to us on the mailing lists, on IRC or file
a new issue on the `tracker`_.

.. _tracker: http://tracker.ceph.com/projects/ceph/issues/new

.. |---| unicode:: U+2014 .. EM DASH
   :trim:

@ -484,8 +484,8 @@ Every Object Gateway tracks per user and bucket metrics separately, these metric
That means that the desired configured limits should be divided by the number of active Object Gateways.
For example, if userA should be limited to 10 ops per minute and there are 2 Object Gateways in the cluster,
the limit on userA should be 5 (10 ops per minute / 2 RGWs).
If the requests are **not** balanced between RGWs, the rate limit may be underutilized.
For example, if the ops limit is 5 and there are 2 RGWs, **but** the Load Balancer sends load to only one of those RGWs,
the effective limit would be 5 ops, because this limit is enforced per RGW.
If the limit is reached for the bucket but not for the user, or vice versa, the request is cancelled as well.
Bandwidth counting happens after the request is accepted; as a result, even if the bucket or user reaches its bandwidth limit in the middle of a request, that request will proceed.

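For context, per-RGW limits such as those discussed above are configured with ``radosgw-admin ratelimit``. A hedged sketch follows; the user ID and values are placeholders, with 5 ops configured per RGW so that two gateways together provide the intended 10 ops per minute:

::

   # radosgw-admin ratelimit set --ratelimit-scope=user --uid=userA --max-read-ops=5 --max-write-ops=5
   # radosgw-admin ratelimit enable --ratelimit-scope=user --uid=userA
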
@ -652,10 +652,12 @@ user usage within date ranges too.
Options include:

- **Start Date:** The ``--start-date`` option allows you to filter usage
  stats from a particular start date and an optional start time
  (**format:** ``yyyy-mm-dd [HH:MM:SS]``).

- **End Date:** The ``--end-date`` option allows you to filter usage up
  to a particular date and an optional end time
  (**format:** ``yyyy-mm-dd [HH:MM:SS]``).

- **Log Entries:** The ``--show-log-entries`` option allows you to specify
  whether or not to include log entries with the usage stats

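A typical invocation that combines these options might look like the following sketch; the user ID and dates are placeholders:

::

   # radosgw-admin usage show --uid=johndoe --start-date="2022-06-01 08:00:00" --end-date=2022-07-01
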
@ -7,6 +7,9 @@ Compression
The Ceph Object Gateway supports server-side compression of uploaded objects,
using any of Ceph's existing compression plugins.

.. note:: The Reef release added a :ref:`feature_compress_encrypted` zonegroup
   feature to enable compression with `Server-Side Encryption`_.


Configuration
=============
@ -84,4 +87,5 @@ The ``size_utilized`` and ``size_kb_utilized`` fields represent the total
size of compressed data, in bytes and kilobytes respectively.


.. _`Server-Side Encryption`: ../encryption
.. _`Multisite Configuration`: ../multisite

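For reference alongside the Compression page excerpt above, compression itself is enabled per placement target; a hedged sketch (the zone, placement target, and plugin are placeholders) of the kind of command the Configuration section goes on to describe:

::

   # radosgw-admin zone placement modify --rgw-zone=default --placement-id=default-placement --compression=zlib
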
@ -40,10 +40,15 @@ To upload a script:

::

   # radosgw-admin script put --infile={lua-file} --context={prerequest|postrequest|background|getdata|putdata} [--tenant={tenant-name}]
   # radosgw-admin script put --infile={lua-file-path} --context={prerequest|postrequest|background|getdata|putdata} [--tenant={tenant-name}]


* When uploading a script with the ``background`` context, a tenant name may not be specified.
* When uploading a script with the ``background`` context, a tenant name should not be specified.
* When uploading a script into a cluster deployed with cephadm, use the following command:

::

   # cephadm shell radosgw-admin script put --infile=/rootfs/{lua-file-path} --context={prerequest|postrequest|background|getdata|putdata} [--tenant={tenant-name}]


To print the content of the script to standard output:

@ -46,6 +46,24 @@ configurations for the Ceph Object Gateway:
  a global object namespace. This global object namespace ensures unique
  object IDs across zonegroups and zones.

  Each bucket is owned by the zonegroup where it was created (except where
  overridden by the :ref:`LocationConstraint<s3_bucket_placement>` on
  bucket creation), and its object data will replicate only to other zones in
  that zonegroup. Any requests for data in that bucket that are sent to other
  zonegroups will be redirected to the zonegroup where the bucket resides.

  It can be useful to create multiple zonegroups when you want to share a
  namespace of users and buckets across many zones, but isolate the object data
  to a subset of those zones. It might be that you have several connected sites
  that share storage, but only require a single backup for purposes of disaster
  recovery. In such a case, it could make sense to create several zonegroups
  with only two zones each to avoid replicating all objects to all zones (a
  command sketch follows this list).

  In other cases, it might make more sense to isolate things in separate
  realms, with each realm having a single zonegroup. Zonegroups provide
  flexibility by making it possible to control the isolation of data and
  metadata separately.

- **Multiple Realms:** Beginning with the Kraken release, the Ceph Object
  Gateway supports "realms", which are containers for zonegroups. Realms make
  it possible to set policies that apply to multiple zonegroups. Realms have a
@ -55,6 +73,7 @@ configurations for the Ceph Object Gateway:
  realm can have a configuration that is distinct from the configuration of
  other realms).

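As a rough sketch of the layout described in the list above (all names and endpoints are illustrative), an additional zonegroup with its own pair of zones can be created in the same realm and committed as follows:

::

   # radosgw-admin zonegroup create --rgw-realm=gold --rgw-zonegroup=zg2 --endpoints=http://rgw-c:80
   # radosgw-admin zone create --rgw-zonegroup=zg2 --rgw-zone=zg2-a --endpoints=http://rgw-c:80
   # radosgw-admin zone create --rgw-zonegroup=zg2 --rgw-zone=zg2-b --endpoints=http://rgw-d:80
   # radosgw-admin period update --commit
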

Diagram - Replication of Object Data Between Zones
--------------------------------------------------

@ -1344,7 +1363,7 @@ Zones
-----

A zone defines a logical group that consists of one or more Ceph Object Gateway
instances. Ceph Object Gateway supports zones.
instances. All RGWs in a given zone serve S3 objects that are backed by RADOS objects that are stored in the same set of pools in the same cluster. Ceph Object Gateway supports zones.

The procedure for configuring zones differs from typical configuration
procedures, because not all of the settings end up in a Ceph configuration

@ -1574,15 +1593,17 @@ On creation of new zones and zonegroups, all known features are supported/enable
Supported Features
------------------

+-----------------------------------+---------+----------+
| Feature                           | Release | Default  |
+===================================+=========+==========+
| :ref:`feature_resharding`         | Reef    | Enabled  |
+-----------------------------------+---------+----------+
| :ref:`feature_compress_encrypted` | Reef    | Disabled |
+-----------------------------------+---------+----------+

.. _feature_resharding:

Resharding
~~~~~~~~~~

This feature allows buckets to be resharded in a multisite configuration
@ -1597,6 +1618,21 @@ of its RGWs and OSDs have upgraded.
the Reef release.


.. _feature_compress_encrypted:

compress-encrypted
~~~~~~~~~~~~~~~~~~

This feature enables support for combining `Server-Side Encryption`_ and
`Compression`_ on the same object. Object data gets compressed before encryption.
Prior to Reef, multisite would not replicate such objects correctly, so all zones
must upgrade to Reef or later before enabling.

.. warning:: The compression ratio may leak information about the encrypted data,
   and allow attackers to distinguish whether two same-sized objects might contain
   the same data. Due to these security considerations, this feature is disabled
   by default.

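Once every zone in the zonegroup is running Reef, opting in is a matter of enabling the feature and committing the period; a sketch (the zonegroup name is a placeholder):

::

   # radosgw-admin zonegroup modify --rgw-zonegroup=default --enable-feature=compress-encrypted
   # radosgw-admin period update --commit
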

Commands
--------

@ -1644,3 +1680,5 @@ On any cluster in the realm:

.. _`Pools`: ../pools
.. _`Sync Policy Config`: ../multisite-sync-policy
.. _`Server-Side Encryption`: ../encryption
.. _`Compression`: ../compression

@ -7,51 +7,54 @@
Overview
--------

The **S3 Select** engine creates an efficient pipe between clients and Ceph
back end nodes. The S3 Select engine works best when implemented as closely as
possible to back end storage.

The S3 Select engine makes it possible to use an SQL-like syntax to select a
restricted subset of data stored in an S3 object. The S3 Select engine
facilitates the use of higher-level analytic applications (for example:
SPARK-SQL). The ability of the S3 Select engine to target a proper subset of
structured data within an S3 object decreases latency and increases throughput.

For example: assume that a user needs to extract a single column that is
filtered by another column, and that these columns are stored in a CSV file in
an S3 object that is several GB in size. The following query performs this
extraction: ``select customer-id from s3Object where age>30 and age<65;``

Without the use of S3 Select, the whole S3 object must be retrieved from an OSD
via RGW before the data is filtered and extracted. Significant network and CPU
overhead are saved by "pushing down" the query into radosgw.

**The bigger the object and the more accurate the query,
the better the performance of s3select**.

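The query above is issued through the AWS CLI's ``select-object-content`` call; a minimal sketch follows, in which the endpoint, bucket, and key are placeholders:

::

   aws --endpoint-url http://localhost:8000 s3api select-object-content \
       --bucket my-bucket --key customers.csv \
       --expression-type SQL \
       --expression "select customer-id from s3object where age>30 and age<65;" \
       --input-serialization '{"CSV": {"FileHeaderInfo": "USE"}, "CompressionType": "NONE"}' \
       --output-serialization '{"CSV": {}}' /dev/stdout
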
Basic Workflow
--------------

S3 Select queries are sent to RGW via `AWS-CLI
<https://docs.aws.amazon.com/cli/latest/reference/s3api/select-object-content.html>`_.

S3 Select passes the authentication and permission parameters as an incoming
message (POST). ``RGWSelectObj_ObjStore_S3::send_response_data`` is the entry
point and handles each fetched chunk according to the object key that was
input. ``send_response_data`` is the first to handle the input query: it
extracts the query and other CLI parameters.

RGW executes an S3 Select query on each new fetched chunk (up to 4 MB). The
current implementation supports CSV objects. CSV rows are sometimes "cut" in
the middle by the limits of the chunks, and those broken lines (the first or
last per chunk) are skipped while processing the query. Such broken lines are
stored and later merged with the next broken line (which belongs to the next
chunk), and only then processed.

For each processed chunk, an output message is formatted according to the `AWS
specification
<https://docs.aws.amazon.com/AmazonS3/latest/API/archive-RESTObjectSELECTContent.html#archive-RESTObjectSELECTContent-responses>`_
and sent back to the client. RGW supports the following response:
``{:event-type,records} {:content-type,application/octet-stream}
{:message-type,event}``. For aggregation queries, the last chunk should be
identified as the end of input; at that point the s3select engine initiates
end-of-process and produces an aggregated result.


Basic Functionalities
@ -477,10 +477,7 @@ function populate_wheelhouse() {
    pip $PIP_OPTS $install \
      'setuptools >= 0.8' 'pip >= 21.0' 'wheel >= 0.24' 'tox >= 2.9.1' || return 1
    if test $# != 0 ; then
        # '--use-feature=fast-deps --use-deprecated=legacy-resolver' added per
        # https://github.com/pypa/pip/issues/9818 These should be able to be
        # removed at some point in the future.
        pip --use-feature=fast-deps --use-deprecated=legacy-resolver $PIP_OPTS $install $@ || return 1
        pip $PIP_OPTS $install $@ || return 1
    fi
}
|
||||
|
2
ceph/qa/distros/all/centos_9.stream.yaml
Normal file
2
ceph/qa/distros/all/centos_9.stream.yaml
Normal file
@ -0,0 +1,2 @@
|
||||
os_type: centos
|
||||
os_version: "9.stream"
|
1
ceph/qa/distros/all/centos_latest.yaml
Symbolic link
1
ceph/qa/distros/all/centos_latest.yaml
Symbolic link
@ -0,0 +1 @@
|
||||
centos_9.stream.yaml
|
1
ceph/qa/distros/supported-random-distro$/centos_latest.yaml
Symbolic link
1
ceph/qa/distros/supported-random-distro$/centos_latest.yaml
Symbolic link
@ -0,0 +1 @@
|
||||
../all/centos_latest.yaml
|
1
ceph/qa/distros/supported/centos_8.stream.yaml
Symbolic link
1
ceph/qa/distros/supported/centos_8.stream.yaml
Symbolic link
@ -0,0 +1 @@
|
||||
../all/centos_8.stream.yaml
|
@ -1 +1 @@
|
||||
../all/centos_8.yaml
|
||||
../all/centos_latest.yaml
|
@ -8,9 +8,13 @@ overrides:
|
||||
- \(OSD_
|
||||
- \(OBJECT_
|
||||
- \(POOL_APP_NOT_ENABLED\)
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
timeout: 1h
|
||||
clients:
|
||||
client.0:
|
||||
- rados/test_python.sh --eval-attr 'not (wait or tier or ec or bench or stats)'
|
||||
- rados/test_python.sh -m 'not (wait or tier or ec or bench or stats)'
|
||||
|
@ -3,10 +3,14 @@ overrides:
|
||||
log-ignorelist:
|
||||
- \(SLOW_OPS\)
|
||||
- slow request
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
client.0:
|
||||
- rbd/test_librbd_python.sh --eval-attr 'not (SKIP_IF_CRIMSON)'
|
||||
- rbd/test_librbd_python.sh -m 'not skip_if_crimson'
|
||||
env:
|
||||
RBD_FEATURES: "61"
|
||||
|
@ -3,8 +3,12 @@ overrides:
|
||||
log-ignorelist:
|
||||
- \(SLOW_OPS\)
|
||||
- slow request
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
client.0:
|
||||
- rbd/test_librbd_python.sh --eval-attr 'not (SKIP_IF_CRIMSON)'
|
||||
- rbd/test_librbd_python.sh -m 'not skip_if_crimson'
|
||||
|
@ -3,6 +3,13 @@ overrides:
|
||||
disabled: true
|
||||
kclient:
|
||||
disabled: true
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
deb:
|
||||
- python3-pytest
|
||||
rpm:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
|
@ -11,3 +11,4 @@ overrides:
|
||||
- has not responded to cap revoke by MDS for over
|
||||
- MDS_CLIENT_LATE_RELEASE
|
||||
- responding to mclientcaps
|
||||
- RECENT_CRASH
|
||||
|
@ -0,0 +1 @@
|
||||
.qa/distros/supported/centos_8.stream.yaml
|
@ -1 +0,0 @@
|
||||
.qa/distros/supported/centos_latest.yaml
|
@ -0,0 +1 @@
|
||||
.qa/distros/supported/centos_8.stream.yaml
|
@ -1 +0,0 @@
|
||||
.qa/distros/supported/centos_latest.yaml
|
1
ceph/qa/suites/fs/upgrade/nofs/centos_8.yaml
Symbolic link
1
ceph/qa/suites/fs/upgrade/nofs/centos_8.yaml
Symbolic link
@ -0,0 +1 @@
|
||||
.qa/distros/supported/centos_8.stream.yaml
|
@ -1 +0,0 @@
|
||||
.qa/distros/supported/centos_latest.yaml
|
1
ceph/qa/suites/fs/upgrade/upgraded_client/centos_8.yaml
Symbolic link
1
ceph/qa/suites/fs/upgrade/upgraded_client/centos_8.yaml
Symbolic link
@ -0,0 +1 @@
|
||||
.qa/distros/supported/centos_8.stream.yaml
|
@ -1 +0,0 @@
|
||||
.qa/distros/supported/centos_latest.yaml
|
@ -8,6 +8,8 @@ tasks:
|
||||
- radosgw-admin zone create --rgw-zonegroup=default --rgw-zone=z --master --default
|
||||
- radosgw-admin period update --rgw-realm=r --commit
|
||||
- ceph orch apply rgw foo --realm r --zone z --placement=2 --port=8000
|
||||
# simple rgw spec (will have no "spec" field) to make sure that works with rgw spec migration
|
||||
- ceph orch apply rgw smpl
|
||||
# setup iscsi
|
||||
- ceph osd pool create foo
|
||||
- rbd pool init foo
|
||||
|
@ -2,6 +2,7 @@ overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- \(PG_AVAILABILITY\)
|
||||
- \(POOL_APP_NOT_ENABLED\)
|
||||
conf:
|
||||
osd:
|
||||
osd_class_load_list: "*"
|
||||
|
@ -8,6 +8,13 @@ overrides:
|
||||
- \(OSD_
|
||||
- \(OBJECT_
|
||||
- \(POOL_APP_NOT_ENABLED\)
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
rpm:
|
||||
- python3-pytest
|
||||
deb:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
timeout: 1h
|
||||
|
@ -1 +0,0 @@
|
||||
../.qa
|
@ -1 +0,0 @@
|
||||
../.qa/distros/supported/centos_latest.yaml
|
@ -1 +0,0 @@
|
||||
../.qa/distros/supported/rhel_latest.yaml
|
@ -1,22 +0,0 @@
|
||||
overrides:
|
||||
ceph:
|
||||
fs: ext4
|
||||
conf:
|
||||
global:
|
||||
osd max object name len: 460
|
||||
osd max object namespace len: 64
|
||||
osd client message cap: 5000
|
||||
roles:
|
||||
- [mon.a, mgr.x, osd.0, osd.1, osd.2, client.0]
|
||||
tasks:
|
||||
- install:
|
||||
- ceph:
|
||||
pre-mgr-commands:
|
||||
- sudo ceph config set mgr mgr_pool false --force
|
||||
log-ignorelist:
|
||||
- overall HEALTH_
|
||||
- \(POOL_APP_NOT_ENABLED\)
|
||||
- workunit:
|
||||
clients:
|
||||
all:
|
||||
- rados/test_envlibrados_for_rocksdb.sh
|
@ -3,6 +3,10 @@ overrides:
|
||||
log-ignorelist:
|
||||
- \(SLOW_OPS\)
|
||||
- slow request
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
|
@ -1,3 +1,8 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
|
@ -1,3 +1,8 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
|
@ -1,3 +1,8 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
|
@ -1,3 +1,8 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
|
@ -1,3 +1,8 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
|
@ -1,3 +1,8 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
|
@ -4,7 +4,7 @@ tasks:
|
||||
- tox: [ client.0 ]
|
||||
- keystone:
|
||||
client.0:
|
||||
force-branch: stable/xena
|
||||
force-branch: stable/2023.1
|
||||
services:
|
||||
- name: swift
|
||||
type: object-store
|
||||
@ -15,7 +15,7 @@ tasks:
|
||||
use-keystone-role: client.0
|
||||
- tempest:
|
||||
client.0:
|
||||
sha1: 30.0.0
|
||||
sha1: 34.1.0
|
||||
force-branch: master
|
||||
use-keystone-role: client.0
|
||||
auth:
|
||||
@ -35,6 +35,8 @@ tasks:
|
||||
object-storage-feature-enabled:
|
||||
container_sync: false
|
||||
discoverability: true
|
||||
# TODO(tobias-urdin): Use sha256 when supported in RadosGW
|
||||
tempurl_digest_hashlib: sha1
|
||||
blocklist:
|
||||
- .*test_account_quotas_negative.AccountQuotasNegativeTest.test_user_modify_quota
|
||||
- .*test_container_acl_negative.ObjectACLsNegativeTest.*
|
||||
@ -48,6 +50,7 @@ tasks:
|
||||
- .*test_container_services.ContainerTest.test_create_container_with_remove_metadata_value
|
||||
- .*test_object_expiry.ObjectExpiryTest.test_get_object_after_expiry_time
|
||||
- .*test_object_expiry.ObjectExpiryTest.test_get_object_at_expiry_time
|
||||
- .*test_account_services.AccountTest.test_list_no_account_metadata
|
||||
|
||||
overrides:
|
||||
ceph:
|
||||
@ -57,7 +60,7 @@ overrides:
|
||||
osd_max_pg_log_entries: 10
|
||||
client:
|
||||
rgw keystone api version: 3
|
||||
rgw keystone accepted roles: admin,Member
|
||||
rgw keystone accepted roles: admin,member
|
||||
rgw keystone implicit tenants: true
|
||||
rgw keystone accepted admin roles: admin
|
||||
rgw swift enforce content length: true
|
||||
|
@ -0,0 +1 @@
|
||||
.qa/distros/supported/centos_8.stream.yaml
|
@ -1 +0,0 @@
|
||||
.qa/distros/supported/centos_latest.yaml
|
5
ceph/qa/suites/rgw/verify/tasks/versioning.yaml
Normal file
5
ceph/qa/suites/rgw/verify/tasks/versioning.yaml
Normal file
@ -0,0 +1,5 @@
|
||||
tasks:
|
||||
- workunit:
|
||||
clients:
|
||||
client.0:
|
||||
- rgw/run-versioning.sh
|
@ -1,3 +1,8 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- ceph:
|
||||
log-ignorelist:
|
||||
|
@ -1,3 +1,8 @@
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
tasks:
|
||||
- ceph:
|
||||
- ceph-fuse:
|
||||
|
@ -1,6 +1,11 @@
|
||||
meta:
|
||||
- desc: |
|
||||
librbd python api tests
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
workload:
|
||||
full_sequential:
|
||||
- print: "**** done start test_rbd_python.yaml"
|
||||
|
@ -1,6 +1,11 @@
|
||||
meta:
|
||||
- desc: |
|
||||
librbd python api tests
|
||||
overrides:
|
||||
install:
|
||||
ceph:
|
||||
extra_system_packages:
|
||||
- python3-pytest
|
||||
workload:
|
||||
full_sequential:
|
||||
- print: "**** done start test_rbd_python.yaml"
|
||||
|
@ -92,7 +92,7 @@ class CephTestCase(unittest.TestCase):
|
||||
|
||||
|
||||
def assert_cluster_log(self, expected_pattern, invert_match=False,
|
||||
timeout=10, watch_channel=None):
|
||||
timeout=10, watch_channel=None, present=True):
|
||||
"""
|
||||
Context manager. Assert that during execution, or up to 5 seconds later,
|
||||
the Ceph cluster log emits a message matching the expected pattern.
|
||||
@ -102,6 +102,8 @@ class CephTestCase(unittest.TestCase):
|
||||
:param watch_channel: Specifies the channel to be watched. This can be
|
||||
'cluster', 'audit', ...
|
||||
:type watch_channel: str
|
||||
:param present: Assert the log entry is present (default: True) or not (False).
|
||||
:type present: bool
|
||||
"""
|
||||
|
||||
ceph_manager = self.ceph_cluster.mon_manager
|
||||
@ -118,10 +120,13 @@ class CephTestCase(unittest.TestCase):
|
||||
self.watcher_process = ceph_manager.run_ceph_w(watch_channel)
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
fail = False
|
||||
if not self.watcher_process.finished:
|
||||
# Check if we got an early match, wait a bit if we didn't
|
||||
if self.match():
|
||||
if present and self.match():
|
||||
return
|
||||
elif not present and self.match():
|
||||
fail = True
|
||||
else:
|
||||
log.debug("No log hits yet, waiting...")
|
||||
# Default monc tick interval is 10s, so wait that long and
|
||||
@ -134,9 +139,12 @@ class CephTestCase(unittest.TestCase):
|
||||
except CommandFailedError:
|
||||
pass
|
||||
|
||||
if not self.match():
|
||||
log.error("Log output: \n{0}\n".format(self.watcher_process.stdout.getvalue()))
|
||||
raise AssertionError("Expected log message not found: '{0}'".format(expected_pattern))
|
||||
if present and not self.match():
|
||||
log.error(f"Log output: \n{self.watcher_process.stdout.getvalue()}\n")
|
||||
raise AssertionError(f"Expected log message found: '{expected_pattern}'")
|
||||
elif fail or (not present and self.match()):
|
||||
log.error(f"Log output: \n{self.watcher_process.stdout.getvalue()}\n")
|
||||
raise AssertionError(f"Unexpected log message found: '{expected_pattern}'")
|
||||
|
||||
return ContextManager()
|
||||
|
||||
|
@ -1153,6 +1153,9 @@ class Filesystem(MDSCluster):
|
||||
if timeout is None:
|
||||
timeout = DAEMON_WAIT_TIMEOUT
|
||||
|
||||
if self.id is None:
|
||||
status = self.getinfo(refresh=True)
|
||||
|
||||
if status is None:
|
||||
status = self.status()
|
||||
|
||||
|
@ -184,6 +184,10 @@ class BacktracelessFile(Workload):
|
||||
# We might not have got the name or path, but we should still get the size
|
||||
self.assert_equal(st['st_size'], self._initial_state['st_size'])
|
||||
|
||||
# remove the entry from lost+found directory
|
||||
self._mount.run_shell(["sudo", "rm", "-f", f'lost+found/{ino_name}'], omit_sudo=False)
|
||||
self.assert_equal(self._mount.ls("lost+found", sudo=True), [])
|
||||
|
||||
return self._errors
|
||||
|
||||
|
||||
|
@ -2,7 +2,7 @@ import time
|
||||
import signal
|
||||
import logging
|
||||
import operator
|
||||
from random import randint
|
||||
from random import randint, choice
|
||||
|
||||
from tasks.cephfs.cephfs_test_case import CephFSTestCase
|
||||
from teuthology.exceptions import CommandFailedError
|
||||
@ -297,6 +297,27 @@ class TestFailover(CephFSTestCase):
|
||||
CLIENTS_REQUIRED = 1
|
||||
MDSS_REQUIRED = 2
|
||||
|
||||
def test_repeated_boot(self):
|
||||
"""
|
||||
That multiple boot messages do not result in the MDS getting evicted.
|
||||
"""
|
||||
|
||||
interval = 10
|
||||
self.config_set("mon", "paxos_propose_interval", interval)
|
||||
|
||||
mds = choice(list(self.fs.status().get_all()))
|
||||
|
||||
with self.assert_cluster_log(f"daemon mds.{mds['name']} restarted", present=False):
|
||||
# Avoid a beacon to the monitors with down:dne by restarting:
|
||||
self.fs.mds_fail(mds_id=mds['name'])
|
||||
# `ceph mds fail` won't return until the FSMap is committed, double-check:
|
||||
self.assertIsNone(self.fs.status().get_mds_gid(mds['gid']))
|
||||
time.sleep(2) # for mds to restart and accept asok commands
|
||||
status1 = self.fs.mds_asok(['status'], mds_id=mds['name'])
|
||||
time.sleep(interval*1.5)
|
||||
status2 = self.fs.mds_asok(['status'], mds_id=mds['name'])
|
||||
self.assertEqual(status1['id'], status2['id'])
|
||||
|
||||
def test_simple(self):
|
||||
"""
|
||||
That when the active MDS is killed, a standby MDS is promoted into
|
||||
|
@ -572,3 +572,46 @@ class TestCacheDrop(CephFSTestCase):
|
||||
# particular operation causing this is journal flush which causes the
|
||||
# MDS to wait wait for cap revoke.
|
||||
self.mount_a.resume_netns()
|
||||
|
||||
class TestSkipReplayInoTable(CephFSTestCase):
|
||||
MDSS_REQUIRED = 1
|
||||
CLIENTS_REQUIRED = 1
|
||||
|
||||
def test_alloc_cinode_assert(self):
|
||||
"""
|
||||
Test alloc CInode assert.
|
||||
|
||||
See: https://tracker.ceph.com/issues/52280
|
||||
"""
|
||||
|
||||
# Create a directory and the mds will journal this and then crash
|
||||
self.mount_a.run_shell(["rm", "-rf", "test_alloc_ino"])
|
||||
self.mount_a.run_shell(["mkdir", "test_alloc_ino"])
|
||||
|
||||
status = self.fs.status()
|
||||
rank0 = self.fs.get_rank(rank=0, status=status)
|
||||
|
||||
self.fs.mds_asok(['config', 'set', 'mds_kill_skip_replaying_inotable', "true"])
|
||||
# This will make the MDS crash, since we only have one MDS in the
|
||||
# cluster and without the "wait=False" it will stuck here forever.
|
||||
self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir1"], wait=False)
|
||||
|
||||
# sleep 10 seconds to make sure the journal logs are flushed and
|
||||
# the mds crashes
|
||||
time.sleep(10)
|
||||
|
||||
# Now set the mds config to skip replaying the inotable
|
||||
self.fs.set_ceph_conf('mds', 'mds_inject_skip_replaying_inotable', True)
|
||||
self.fs.set_ceph_conf('mds', 'mds_wipe_sessions', True)
|
||||
|
||||
self.fs.mds_restart()
|
||||
# sleep 5 seconds to make sure the mds tell command won't stuck
|
||||
time.sleep(5)
|
||||
self.fs.wait_for_daemons()
|
||||
|
||||
self.delete_mds_coredump(rank0['name']);
|
||||
|
||||
self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir2"])
|
||||
|
||||
ls_out = set(self.mount_a.ls("test_alloc_ino/"))
|
||||
self.assertEqual(ls_out, set({"dir1", "dir2"}))
|
||||
|
@ -550,6 +550,16 @@ class TestMonSnapsAndFsPools(CephFSTestCase):
|
||||
with self.assertRaises(CommandFailedError):
|
||||
self.fs.rados(["mksnap", "snap2"], pool=self.fs.get_metadata_pool_name())
|
||||
|
||||
with self.assertRaises(CommandFailedError):
|
||||
test_pool_name = self.fs.get_data_pool_name()
|
||||
base_cmd = f'osd pool mksnap {test_pool_name} snap3'
|
||||
self.run_cluster_cmd(base_cmd)
|
||||
|
||||
with self.assertRaises(CommandFailedError):
|
||||
test_pool_name = self.fs.get_metadata_pool_name()
|
||||
base_cmd = f'osd pool mksnap {test_pool_name} snap4'
|
||||
self.run_cluster_cmd(base_cmd)
|
||||
|
||||
def test_attaching_pools_with_snaps_to_fs_fails(self):
|
||||
"""
|
||||
Test that attempt to attach pool with snapshots to an fs fails
|
||||
|
@ -42,9 +42,9 @@ def run_in_keystone_venv(ctx, client, args):
|
||||
run.Raw('&&')
|
||||
] + args)
|
||||
|
||||
def get_keystone_venved_cmd(ctx, cmd, args):
|
||||
def get_keystone_venved_cmd(ctx, cmd, args, env=[]):
|
||||
kbindir = get_keystone_dir(ctx) + '/.tox/venv/bin/'
|
||||
return [ kbindir + 'python', kbindir + cmd ] + args
|
||||
return env + [ kbindir + 'python', kbindir + cmd ] + args
|
||||
|
||||
@contextlib.contextmanager
|
||||
def download(ctx, config):
|
||||
@ -143,6 +143,37 @@ def install_packages(ctx, config):
|
||||
for dep in packages[client]:
|
||||
remove_package(dep, remote)
|
||||
|
||||
def run_mysql_query(ctx, remote, query):
|
||||
query_arg = '--execute="{}"'.format(query)
|
||||
args = ['sudo', 'mysql', run.Raw(query_arg)]
|
||||
remote.run(args=args)
|
||||
|
||||
@contextlib.contextmanager
|
||||
def setup_database(ctx, config):
|
||||
"""
|
||||
Setup database for Keystone.
|
||||
"""
|
||||
assert isinstance(config, dict)
|
||||
log.info('Setting up database for keystone...')
|
||||
|
||||
for (client, cconf) in config.items():
|
||||
(remote,) = ctx.cluster.only(client).remotes.keys()
|
||||
|
||||
# MariaDB on RHEL/CentOS needs service started after package install
|
||||
# while Ubuntu starts service by default.
|
||||
if remote.os.name == 'rhel' or remote.os.name == 'centos':
|
||||
remote.run(args=['sudo', 'systemctl', 'restart', 'mariadb'])
|
||||
|
||||
run_mysql_query(ctx, remote, "CREATE USER 'keystone'@'localhost' IDENTIFIED BY 'SECRET';")
|
||||
run_mysql_query(ctx, remote, "CREATE DATABASE keystone;")
|
||||
run_mysql_query(ctx, remote, "GRANT ALL PRIVILEGES ON keystone.* TO 'keystone'@'localhost';")
|
||||
run_mysql_query(ctx, remote, "FLUSH PRIVILEGES;")
|
||||
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
pass
|
||||
|
||||
@contextlib.contextmanager
|
||||
def setup_venv(ctx, config):
|
||||
"""
|
||||
@ -151,6 +182,9 @@ def setup_venv(ctx, config):
|
||||
assert isinstance(config, dict)
|
||||
log.info('Setting up virtualenv for keystone...')
|
||||
for (client, _) in config.items():
|
||||
run_in_keystone_dir(ctx, client,
|
||||
['sed', '-i', 's/usedevelop.*/usedevelop=false/g', 'tox.ini'])
|
||||
|
||||
run_in_keystone_dir(ctx, client,
|
||||
[ 'source',
|
||||
'{tvdir}/bin/activate'.format(tvdir=get_toxvenv_dir(ctx)),
|
||||
@ -173,7 +207,8 @@ def configure_instance(ctx, config):
|
||||
assert isinstance(config, dict)
|
||||
log.info('Configuring keystone...')
|
||||
|
||||
keyrepo_dir = '{kdir}/etc/fernet-keys'.format(kdir=get_keystone_dir(ctx))
|
||||
kdir = get_keystone_dir(ctx)
|
||||
keyrepo_dir = '{kdir}/etc/fernet-keys'.format(kdir=kdir)
|
||||
for (client, _) in config.items():
|
||||
# prepare the config file
|
||||
run_in_keystone_dir(ctx, client,
|
||||
@ -195,6 +230,12 @@ def configure_instance(ctx, config):
|
||||
'-e', 's^#key_repository =.*^key_repository = {kr}^'.format(kr = keyrepo_dir),
|
||||
'-i', 'etc/keystone.conf'
|
||||
])
|
||||
run_in_keystone_dir(ctx, client,
|
||||
[
|
||||
'sed',
|
||||
'-e', 's^#connection =.*^connection = mysql+pymysql://keystone:SECRET@localhost/keystone^',
|
||||
'-i', 'etc/keystone.conf'
|
||||
])
|
||||
# log to a file that gets archived
|
||||
log_file = '{p}/archive/keystone.{c}.log'.format(p=teuthology.get_testdir(ctx), c=client)
|
||||
run_in_keystone_dir(ctx, client,
|
||||
@ -209,12 +250,14 @@ def configure_instance(ctx, config):
|
||||
'{}/archive/keystone.{}.conf'.format(teuthology.get_testdir(ctx), client)
|
||||
])
|
||||
|
||||
conf_file = '{kdir}/etc/keystone.conf'.format(kdir=get_keystone_dir(ctx))
|
||||
|
||||
# prepare key repository for Fetnet token authenticator
|
||||
run_in_keystone_dir(ctx, client, [ 'mkdir', '-p', keyrepo_dir ])
|
||||
run_in_keystone_venv(ctx, client, [ 'keystone-manage', 'fernet_setup' ])
|
||||
run_in_keystone_venv(ctx, client, [ 'keystone-manage', '--config-file', conf_file, 'fernet_setup' ])
|
||||
|
||||
# sync database
|
||||
run_in_keystone_venv(ctx, client, [ 'keystone-manage', 'db_sync' ])
|
||||
run_in_keystone_venv(ctx, client, [ 'keystone-manage', '--config-file', conf_file, 'db_sync' ])
|
||||
yield
|
||||
|
||||
@contextlib.contextmanager
|
||||
@ -222,6 +265,8 @@ def run_keystone(ctx, config):
|
||||
assert isinstance(config, dict)
|
||||
log.info('Configuring keystone...')
|
||||
|
||||
conf_file = '{kdir}/etc/keystone.conf'.format(kdir=get_keystone_dir(ctx))
|
||||
|
||||
for (client, _) in config.items():
|
||||
(remote,) = ctx.cluster.only(client).remotes.keys()
|
||||
cluster_name, _, client_id = teuthology.split_role(client)
|
||||
@ -238,7 +283,10 @@ def run_keystone(ctx, config):
|
||||
# our other daemons, doesn't quit on stdin.close().
|
||||
# Teuthology relies on this behaviour.
|
||||
run.Raw('& { read; kill %1; }')
|
||||
]
|
||||
],
|
||||
[
|
||||
run.Raw('OS_KEYSTONE_CONFIG_FILES={}'.format(conf_file)),
],
)
ctx.daemons.add_daemon(
remote, 'keystone', client_public_with_id,
@ -246,27 +294,6 @@ def run_keystone(ctx, config):
args=run_cmd,
logger=log.getChild(client),
stdin=run.PIPE,
cwd=get_keystone_dir(ctx),
wait=False,
check_status=False,
)

# start the admin endpoint
client_admin_with_id = 'keystone.admin' + '.' + client_id

admin_host, admin_port = ctx.keystone.admin_endpoints[client]
run_cmd = get_keystone_venved_cmd(ctx, 'keystone-wsgi-admin',
[ '--host', admin_host, '--port', str(admin_port),
run.Raw('& { read; kill %1; }')
]
)
ctx.daemons.add_daemon(
remote, 'keystone', client_admin_with_id,
cluster=cluster_name,
args=run_cmd,
logger=log.getChild(client),
stdin=run.PIPE,
cwd=get_keystone_dir(ctx),
wait=False,
check_status=False,
)
@ -276,10 +303,6 @@ def run_keystone(ctx, config):
try:
yield
finally:
log.info('Stopping Keystone admin instance')
ctx.daemons.get_daemon('keystone', client_admin_with_id,
cluster_name).stop()

log.info('Stopping Keystone public instance')
ctx.daemons.get_daemon('keystone', client_public_with_id,
cluster_name).stop()
@ -305,7 +328,7 @@ def dict_to_args(specials, items):

def run_section_cmds(ctx, cclient, section_cmd, specials,
section_config_list):
admin_host, admin_port = ctx.keystone.admin_endpoints[cclient]
public_host, public_port = ctx.keystone.public_endpoints[cclient]

auth_section = [
( 'os-username', 'admin' ),
@ -314,8 +337,8 @@ def run_section_cmds(ctx, cclient, section_cmd, specials,
( 'os-project-name', 'admin' ),
( 'os-project-domain-id', 'default' ),
( 'os-identity-api-version', '3' ),
( 'os-auth-url', 'http://{host}:{port}/v3'.format(host=admin_host,
port=admin_port) ),
( 'os-auth-url', 'http://{host}:{port}/v3'.format(host=public_host,
port=public_port) ),
]

for section_item in section_config_list:
@ -344,28 +367,26 @@ def fill_keystone(ctx, config):
public_host, public_port = ctx.keystone.public_endpoints[cclient]
url = 'http://{host}:{port}/v3'.format(host=public_host,
port=public_port)
admin_host, admin_port = ctx.keystone.admin_endpoints[cclient]
admin_url = 'http://{host}:{port}/v3'.format(host=admin_host,
port=admin_port)
opts = {'password': 'ADMIN',
'region-id': 'RegionOne',
'internal-url': url,
'admin-url': admin_url,
'admin-url': url,
'public-url': url}
bootstrap_args = chain.from_iterable(('--bootstrap-{}'.format(k), v)
for k, v in opts.items())
conf_file = '{kdir}/etc/keystone.conf'.format(kdir=get_keystone_dir(ctx))
run_in_keystone_venv(ctx, cclient,
['keystone-manage', 'bootstrap'] +
['keystone-manage', '--config-file', conf_file, 'bootstrap'] +
list(bootstrap_args))

# configure tenants/projects
run_section_cmds(ctx, cclient, 'domain create', 'name',
run_section_cmds(ctx, cclient, 'domain create --or-show', 'name',
cconfig.get('domains', []))
run_section_cmds(ctx, cclient, 'project create', 'name',
run_section_cmds(ctx, cclient, 'project create --or-show', 'name',
cconfig.get('projects', []))
run_section_cmds(ctx, cclient, 'user create', 'name',
run_section_cmds(ctx, cclient, 'user create --or-show', 'name',
cconfig.get('users', []))
run_section_cmds(ctx, cclient, 'role create', 'name',
run_section_cmds(ctx, cclient, 'role create --or-show', 'name',
cconfig.get('roles', []))
run_section_cmds(ctx, cclient, 'role add', 'name',
cconfig.get('role-mappings', []))
@ -410,24 +431,21 @@ def task(ctx, config):
client.0:
force-branch: master
domains:
- name: default
description: Default Domain
- name: custom
description: Custom domain
projects:
- name: admin
description: Admin Tenant
- name: custom
description: Custom project
users:
- name: admin
password: ADMIN
project: admin
roles: [ name: admin, name: Member ]
- name: custom
password: SECRET
project: custom
roles: [ name: custom ]
role-mappings:
- name: admin
user: admin
project: admin
- name: custom
user: custom
project: custom
services:
- name: keystone
type: identity
description: Keystone Identity Service
- name: swift
type: object-store
description: Swift Service
@ -450,11 +468,11 @@ def task(ctx, config):

ctx.keystone = argparse.Namespace()
ctx.keystone.public_endpoints = assign_ports(ctx, config, 5000)
ctx.keystone.admin_endpoints = assign_ports(ctx, config, 35357)

with contextutil.nested(
lambda: download(ctx=ctx, config=config),
lambda: install_packages(ctx=ctx, config=config),
lambda: setup_database(ctx=ctx, config=config),
lambda: setup_venv(ctx=ctx, config=config),
lambda: configure_instance(ctx=ctx, config=config),
lambda: run_keystone(ctx=ctx, config=config),
@ -179,7 +179,7 @@ def task(ctx, config):
conf:
client:
rgw keystone api version: 3
rgw keystone accepted roles: admin,Member
rgw keystone accepted roles: admin,member
rgw keystone implicit tenants: true
rgw keystone accepted admin roles: admin
rgw swift enforce content length: true
@ -678,29 +678,14 @@
}

# "Conditional jump or move depends on uninitialised value(s)" in OpenSSL
# while using aes-128-gcm with AES-NI enabled. Not observed while running
# with `OPENSSL_ia32cap="~0x200000200000000"`.
# https://github.com/openssl/openssl/issues/19719
{
uninitialised gcm.Xi in aes-128-gcm with AES-NI for msgr, part 1
Memcheck:Cond
...
fun:EVP_DecryptFinal_ex
fun:_ZN4ceph6crypto6onwire25AES128GCM_OnWireRxHandler34authenticated_decrypt_update_finalEONS_6buffer7v*4listEj
fun:_ZN10ProtocolV231handle_read_frame_epilogue_mainEOSt10unique_ptrIN4ceph6buffer7v*8ptr_nodeENS4_8disposerEEi
fun:_ZN10ProtocolV216run_continuationER2CtIS_E
...
fun:_ZN15AsyncConnection7processEv
fun:_ZN11EventCenter14process_eventsEjPNSt6chrono8durationImSt5ratioILl1ELl1000000000EEEE
...
}

{
uninitialised gcm.Xi in aes-128-gcm with AES-NI for msgr, part 2
Memcheck:Cond
fun:_ZN4ceph6crypto6onwire25AES128GCM_OnWireRxHandler34authenticated_decrypt_update_finalEONS_6buffer7v*4listEj
fun:_ZN10ProtocolV231handle_read_frame_epilogue_mainEOSt10unique_ptrIN4ceph6buffer7v*8ptr_nodeENS4_8disposerEEi
fun:_ZN10ProtocolV216run_continuationER2CtIS_E
...
fun:_ZN11EventCenter14process_eventsEjPNSt6chrono8durationImSt5ratioILl1ELl1000000000EEEE
...
uninitialized value in gcm_cipher_internal
Memcheck:Cond
...
fun:gcm_cipher_internal
...
fun:ossl_gcm_stream_final
fun:EVP_DecryptFinal_ex
...
}
@ -6,6 +6,7 @@ FIRST_DAMAGE="first-damage.py"
FS=cephfs
METADATA_POOL=cephfs_meta
MOUNT=~/mnt/mnt.0
PYTHON=python3

function usage {
printf '%s: [--fs=<fs_name>] [--metadata-pool=<pool>] [--first-damage=</path/to/first-damage.py>]\n'
@ -19,6 +20,7 @@ function create {
DIR_INODE=$(stat -c '%i' dir)
touch dir/a
touch dir/"a space"
touch -- $(printf 'dir/\xff')
mkdir dir/.snap/1
mkdir dir/.snap/2
# two snaps
@ -83,9 +85,9 @@ function recover {
sleep 5
cephfs-journal-tool --rank="$FS":0 event recover_dentries summary
cephfs-journal-tool --rank="$FS":0 journal reset
python3 $FIRST_DAMAGE --debug /tmp/debug1 --memo /tmp/memo1 "$METADATA_POOL"
python3 $FIRST_DAMAGE --debug /tmp/debug2 --memo /tmp/memo2 --repair-nosnap "$METADATA_POOL"
python3 $FIRST_DAMAGE --debug /tmp/debug3 --memo /tmp/memo3 --remove "$METADATA_POOL"
"$PYTHON" $FIRST_DAMAGE --debug /tmp/debug1 --memo /tmp/memo1 "$METADATA_POOL"
"$PYTHON" $FIRST_DAMAGE --debug /tmp/debug2 --memo /tmp/memo2 --repair-nosnap "$METADATA_POOL"
"$PYTHON" $FIRST_DAMAGE --debug /tmp/debug3 --memo /tmp/memo3 --remove "$METADATA_POOL"
ceph fs set "$FS" joinable true
}

@ -123,7 +125,7 @@ function mount {
}

function main {
eval set -- $(getopt --name "$0" --options '' --longoptions 'help,fs:,metadata-pool:,first-damage:,mount:' -- "$@")
eval set -- $(getopt --name "$0" --options '' --longoptions 'help,fs:,metadata-pool:,first-damage:,mount:,python:' -- "$@")

while [ "$#" -gt 0 ]; do
echo "$*"
@ -148,6 +150,10 @@ function main {
FIRST_DAMAGE="$2"
shift 2
;;
--python)
PYTHON="$2"
shift 2
;;
--)
shift
break
@ -2,5 +2,5 @@

# Running as root because the filesystem root directory will be
# owned by uid 0, and that's where we're writing.
sudo python3 -m nose -v $(dirname $0)/../../../src/test/pybind/test_cephfs.py
sudo python3 -m pytest -v $(dirname $0)/../../../src/test/pybind/test_cephfs.py
exit 0
@ -1,5 +1,5 @@
#!/bin/sh -ex

ceph osd pool create rbd
${PYTHON:-python3} -m nose -v $(dirname $0)/../../../src/test/pybind/test_rados.py "$@"
${PYTHON:-python3} -m pytest -v $(dirname $0)/../../../src/test/pybind/test_rados.py "$@"
exit 0
@ -11,11 +11,9 @@ git clone https://github.com/qemu/qemu.git
cd qemu


if grep -iqE '(bionic|focal|jammy)' /etc/os-release; then
# Bionic requires a matching test harness
if grep -iqE '(bionic|focal|jammy|platform:el9)' /etc/os-release; then
git checkout v2.11.0
elif grep -iqE '(xenial|platform:el8)' /etc/os-release; then
# Xenial requires a recent test harness
git checkout v2.3.0
else
# use v2.2.0-rc3 (last released version that handles all the tests
@ -23,21 +21,19 @@ else
fi

cd tests/qemu-iotests
mkdir bin
# qemu-iotests expects a binary called just 'qemu' to be available
if [ -x '/usr/bin/qemu-system-x86_64' ]
then
QEMU='/usr/bin/qemu-system-x86_64'

# Bionic (v2.11.0) tests expect all tools in current directory
ln -s $QEMU qemu
ln -s /usr/bin/qemu-img
ln -s /usr/bin/qemu-io
ln -s /usr/bin/qemu-nbd
else
QEMU='/usr/libexec/qemu-kvm'
fi
ln -s $QEMU bin/qemu

# Bionic (v2.11.0) tests expect all tools in current directory
ln -s $QEMU qemu
ln -s /usr/bin/qemu-img
ln -s /usr/bin/qemu-io
ln -s /usr/bin/qemu-nbd

# this is normally generated by configure, but has nothing but a python
# binary definition, which we don't care about. for some reason it is
@ -45,7 +41,7 @@ ln -s $QEMU bin/qemu
touch common.env

# TEST_DIR is the pool for rbd
TEST_DIR=rbd PATH="$PATH:$PWD/bin" ./check -rbd $testlist
TEST_DIR=rbd ./check -rbd $testlist

cd ../../..
rm -rf qemu
@ -5,8 +5,8 @@ relpath=$(dirname $0)/../../../src/test/pybind
if [ -n "${VALGRIND}" ]; then
valgrind ${VALGRIND} --suppressions=${TESTDIR}/valgrind.supp \
--errors-for-leak-kinds=definite --error-exitcode=1 \
python3 -m nose -v $relpath/test_rbd.py "$@"
python3 -m pytest -v $relpath/test_rbd.py "$@"
else
python3 -m nose -v $relpath/test_rbd.py "$@"
python3 -m pytest -v $relpath/test_rbd.py "$@"
fi
exit 0
ceph/qa/workunits/rgw/common.py (Executable file)
@ -0,0 +1,57 @@
#!/usr/bin/env python3

import errno
import subprocess
import logging as log
import boto3
import botocore.exceptions

log.basicConfig(format = '%(message)s', level=log.DEBUG)
log.getLogger('botocore').setLevel(log.CRITICAL)
log.getLogger('boto3').setLevel(log.CRITICAL)
log.getLogger('urllib3').setLevel(log.CRITICAL)

def exec_cmd(cmd, wait = True, **kwargs):
    check_retcode = kwargs.pop('check_retcode', True)
    kwargs['shell'] = True
    kwargs['stdout'] = subprocess.PIPE
    proc = subprocess.Popen(cmd, **kwargs)
    log.info(proc.args)
    if wait:
        out, _ = proc.communicate()
        if check_retcode:
            assert(proc.returncode == 0)
            return out
        return (out, proc.returncode)
    return ''

def create_user(uid, display_name, access_key, secret_key):
    _, ret = exec_cmd(f'radosgw-admin user create --uid {uid} --display-name "{display_name}" --access-key {access_key} --secret {secret_key}', check_retcode=False)
    assert(ret == 0 or errno.EEXIST)

def boto_connect(access_key, secret_key, config=None):
    def try_connect(portnum, ssl, proto):
        endpoint = proto + '://localhost:' + portnum
        conn = boto3.resource('s3',
                              aws_access_key_id=access_key,
                              aws_secret_access_key=secret_key,
                              use_ssl=ssl,
                              endpoint_url=endpoint,
                              verify=False,
                              config=config,
                              )
        try:
            list(conn.buckets.limit(1)) # just verify we can list buckets
        except botocore.exceptions.ConnectionError as e:
            print(e)
            raise
        print('connected to', endpoint)
        return conn
    try:
        return try_connect('80', False, 'http')
    except botocore.exceptions.ConnectionError:
        try: # retry on non-privileged http port
            return try_connect('8000', False, 'http')
        except botocore.exceptions.ConnectionError:
            # retry with ssl
            return try_connect('443', True, 'https')
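The helpers above are shared by the rgw workunits; test_rgw_reshard.py and test_rgw_versioning.py below import them instead of carrying their own copies. A minimal usage sketch (not part of this commit), assuming a running radosgw reachable on one of the probed localhost ports and radosgw-admin in PATH; the uid, keys and bucket name are illustrative placeholders:

    # illustrative only -- mirrors how the workunits below consume common.py
    from common import exec_cmd, create_user, boto_connect

    create_user('demo-user', 'Demo User', 'DEMOACCESSKEY', 'demosecretkey')
    conn = boto_connect('DEMOACCESSKEY', 'demosecretkey')   # returns a boto3 S3 resource
    bucket = conn.create_bucket(Bucket='demo-bucket')
    out = exec_cmd('radosgw-admin bucket stats --bucket demo-bucket')  # raw stdout bytes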
ceph/qa/workunits/rgw/run-versioning.sh (Executable file)
@ -0,0 +1,19 @@
#!/usr/bin/env bash
set -ex

# assume working ceph environment (radosgw-admin in path) and rgw on localhost:80
# localhost::443 for ssl

mydir=`dirname $0`

python3 -m venv $mydir
source $mydir/bin/activate
pip install pip --upgrade
pip install boto3

## run test
$mydir/bin/python3 $mydir/test_rgw_versioning.py

deactivate
echo OK.
@ -1,13 +1,11 @@
#!/usr/bin/python3

import errno
import logging as log
import time
import subprocess
import logging as log
import json
import boto3
import botocore.exceptions
import os
from common import exec_cmd, boto_connect, create_user

"""
Rgw manual and dynamic resharding testing against a running instance
@ -19,11 +17,6 @@ Rgw manual and dynamic resharding testing against a running instance
#
#

log.basicConfig(format = '%(message)s', level=log.DEBUG)
log.getLogger('botocore').setLevel(log.CRITICAL)
log.getLogger('boto3').setLevel(log.CRITICAL)
log.getLogger('urllib3').setLevel(log.CRITICAL)

""" Constants """
USER = 'tester'
DISPLAY_NAME = 'Testing'
@ -33,18 +26,6 @@ BUCKET_NAME = 'a-bucket'
VER_BUCKET_NAME = 'myver'
INDEX_POOL = 'default.rgw.buckets.index'

def exec_cmd(cmd, **kwargs):
check_retcode = kwargs.pop('check_retcode', True)
kwargs['shell'] = True
kwargs['stdout'] = subprocess.PIPE
proc = subprocess.Popen(cmd, **kwargs)
log.info(proc.args)
out, _ = proc.communicate()
if check_retcode:
assert(proc.returncode == 0)
return out
return (out, proc.returncode)

class BucketStats:
def __init__(self, bucket_name, bucket_id, num_objs=0, size_kb=0, num_shards=0):
self.bucket_name = bucket_name
@ -163,41 +144,14 @@ def main():
"""
execute manual and dynamic resharding commands
"""
# create user
_, ret = exec_cmd('radosgw-admin user create --uid {} --display-name {} --access-key {} --secret {}'.format(USER, DISPLAY_NAME, ACCESS_KEY, SECRET_KEY), check_retcode=False)
assert(ret == 0 or errno.EEXIST)

def boto_connect(portnum, ssl, proto):
endpoint = proto + '://localhost:' + portnum
conn = boto3.resource('s3',
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY,
use_ssl=ssl,
endpoint_url=endpoint,
verify=False,
config=None,
)
try:
list(conn.buckets.limit(1)) # just verify we can list buckets
except botocore.exceptions.ConnectionError as e:
print(e)
raise
print('connected to', endpoint)
return conn

try:
connection = boto_connect('80', False, 'http')
except botocore.exceptions.ConnectionError:
try: # retry on non-privileged http port
connection = boto_connect('8000', False, 'http')
except botocore.exceptions.ConnectionError:
# retry with ssl
connection = boto_connect('443', True, 'https')
create_user(USER, DISPLAY_NAME, ACCESS_KEY, SECRET_KEY)

connection = boto_connect(ACCESS_KEY, SECRET_KEY)

# create a bucket
bucket = connection.create_bucket(Bucket=BUCKET_NAME)
ver_bucket = connection.create_bucket(Bucket=VER_BUCKET_NAME)
connection.BucketVersioning('ver_bucket')
connection.BucketVersioning(VER_BUCKET_NAME).enable()

bucket_acl = connection.BucketAcl(BUCKET_NAME).load()
ver_bucket_acl = connection.BucketAcl(VER_BUCKET_NAME).load()
@ -313,13 +267,23 @@ def main():
json_op = json.loads(cmd.decode('utf-8', 'ignore')) # ignore utf-8 can't decode 0x80
assert len(json_op) == 0

# TESTCASE 'check that PUT succeeds during reshard'
log.debug(' test: PUT succeeds during reshard')
num_shards = get_bucket_stats(VER_BUCKET_NAME).num_shards
exec_cmd('''radosgw-admin --inject-delay-at=do_reshard --inject-delay-ms=5000 \
bucket reshard --bucket {} --num-shards {}'''
.format(VER_BUCKET_NAME, num_shards + 1), wait = False)
time.sleep(1)
ver_bucket.put_object(Key='put_during_reshard', Body=b"some_data")
log.debug('put object successful')

# Clean up
log.debug("Deleting bucket {}".format(BUCKET_NAME))
bucket.objects.all().delete()
bucket.delete()
log.debug("Deleting bucket {}".format(VER_BUCKET_NAME))
ver_bucket.object_versions.all().delete()
ver_bucket.delete()


main()
log.info("Completed resharding tests")
ceph/qa/workunits/rgw/test_rgw_versioning.py (Executable file)
@ -0,0 +1,110 @@
#!/usr/bin/env python3

import logging as log
import json
import uuid
import botocore
import time
from common import exec_cmd, create_user, boto_connect
from botocore.config import Config

"""
Tests behavior of bucket versioning.
"""
# The test cases in this file have been annotated for inventory.
# To extract the inventory (in csv format) use the command:
#
# grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
#
#

""" Constants """
USER = 'versioning-tester'
DISPLAY_NAME = 'Versioning Testing'
ACCESS_KEY = 'LTA662PVVDTDWX6M2AB0'
SECRET_KEY = 'pvtchqajgzqx5581t6qbddbkj0bgf3a69qdkjcea'
BUCKET_NAME = 'versioning-bucket'
DATA_POOL = 'default.rgw.buckets.data'

def main():
    """
    execute versioning tests
    """
    create_user(USER, DISPLAY_NAME, ACCESS_KEY, SECRET_KEY)

    connection = boto_connect(ACCESS_KEY, SECRET_KEY, Config(retries = {
        'total_max_attempts': 1,
    }))

    # pre-test cleanup
    try:
        bucket = connection.Bucket(BUCKET_NAME)
        bucket.objects.all().delete()
        bucket.object_versions.all().delete()
        bucket.delete()
    except botocore.exceptions.ClientError as e:
        if not e.response['Error']['Code'] == 'NoSuchBucket':
            raise

    bucket = connection.create_bucket(Bucket=BUCKET_NAME)
    connection.BucketVersioning(BUCKET_NAME).enable()

    # reproducer for bug from https://tracker.ceph.com/issues/59663
    # TESTCASE 'verify that index entries and OLH objects are cleaned up after redundant deletes'
    log.debug('TEST: verify that index entries and OLH objects are cleaned up after redundant deletes\n')
    key = str(uuid.uuid4())
    resp = bucket.Object(key).delete()
    assert 'DeleteMarker' in resp, 'DeleteMarker key not present in response'
    assert resp['DeleteMarker'], 'DeleteMarker value not True in response'
    assert 'VersionId' in resp, 'VersionId key not present in response'
    version_id = resp['VersionId']
    bucket.Object(key).delete()
    connection.ObjectVersion(bucket.name, key, version_id).delete()
    # bucket index should now be empty
    out = exec_cmd(f'radosgw-admin bi list --bucket {BUCKET_NAME}')
    json_out = json.loads(out.replace(b'\x80', b'0x80'))
    assert len(json_out) == 0, 'bucket index was not empty after all objects were deleted'

    (_out, ret) = exec_cmd(f'rados -p {DATA_POOL} ls | grep {key}', check_retcode=False)
    assert ret != 0, 'olh object was not cleaned up'

    # TESTCASE 'verify that index entries and OLH objects are cleaned up after index linking error'
    log.debug('TEST: verify that index entries and OLH objects are cleaned up after index linking error\n')
    key = str(uuid.uuid4())
    try:
        exec_cmd('ceph config set client rgw_debug_inject_set_olh_err 2')
        time.sleep(1)
        bucket.Object(key).delete()
    finally:
        exec_cmd('ceph config rm client rgw_debug_inject_set_olh_err')
    out = exec_cmd(f'radosgw-admin bi list --bucket {BUCKET_NAME}')
    json_out = json.loads(out.replace(b'\x80', b'0x80'))
    assert len(json_out) == 0, 'bucket index was not empty after op failed'
    (_out, ret) = exec_cmd(f'rados -p {DATA_POOL} ls | grep {key}', check_retcode=False)
    assert ret != 0, 'olh object was not cleaned up'

    # TESTCASE 'verify that original null object version is intact after failed olh upgrade'
    log.debug('TEST: verify that original null object version is intact after failed olh upgrade\n')
    connection.BucketVersioning(BUCKET_NAME).suspend()
    key = str(uuid.uuid4())
    put_resp = bucket.put_object(Key=key, Body=b"data")
    connection.BucketVersioning(BUCKET_NAME).enable()
    try:
        exec_cmd('ceph config set client rgw_debug_inject_set_olh_err 2')
        time.sleep(1)
        # expected to fail due to the above error injection
        bucket.put_object(Key=key, Body=b"new data")
    except Exception as e:
        log.debug(e)
    finally:
        exec_cmd('ceph config rm client rgw_debug_inject_set_olh_err')
    get_resp = bucket.Object(key).get()
    assert put_resp.e_tag == get_resp['ETag'], 'get did not return null version with correct etag'

    # Clean up
    log.debug("Deleting bucket {}".format(BUCKET_NAME))
    bucket.object_versions.all().delete()
    bucket.delete()

main()
log.info("Completed bucket versioning tests")
@ -1,2 +1,2 @@
a5c951305c2409669162c235d81981bdc60dd9e7
18.1.2
5dd24139a1eada541a3bc16b6941c5dde975e26d
18.2.0
@ -1018,7 +1018,6 @@ def create_lv(name_prefix,
# be so this function will set it after creation using the mapping
# XXX add CEPH_VOLUME_LVM_DEBUG to enable -vvvv on lv operations
type_path_tag = {
'journal': 'ceph.journal_device',
'data': 'ceph.data_device',
'block': 'ceph.block_device',
'wal': 'ceph.wal_device',
@ -15,86 +15,6 @@ from .listing import direct_report
logger = logging.getLogger(__name__)


def activate_filestore(osd_lvs, no_systemd=False):
# find the osd
for osd_lv in osd_lvs:
if osd_lv.tags.get('ceph.type') == 'data':
data_lv = osd_lv
break
else:
raise RuntimeError('Unable to find a data LV for filestore activation')

is_encrypted = data_lv.tags.get('ceph.encrypted', '0') == '1'
is_vdo = data_lv.tags.get('ceph.vdo', '0')

osd_id = data_lv.tags['ceph.osd_id']
configuration.load_ceph_conf_path(data_lv.tags['ceph.cluster_name'])
configuration.load()
# it may have a volume with a journal
for osd_lv in osd_lvs:
if osd_lv.tags.get('ceph.type') == 'journal':
osd_journal_lv = osd_lv
break
else:
osd_journal_lv = None

# TODO: add sensible error reporting if this is ever the case
# blow up with a KeyError if this doesn't exist
osd_fsid = data_lv.tags['ceph.osd_fsid']
if not osd_journal_lv:
# must be a disk partition, by querying blkid by the uuid we are ensuring that the
# device path is always correct
journal_uuid = data_lv.tags['ceph.journal_uuid']
osd_journal = disk.get_device_from_partuuid(journal_uuid)
else:
journal_uuid = osd_journal_lv.lv_uuid
osd_journal = data_lv.tags['ceph.journal_device']

if not osd_journal:
raise RuntimeError('unable to detect an lv or device journal for OSD %s' % osd_id)

# this is done here, so that previous checks that ensure path availability
# and correctness can still be enforced, and report if any issues are found
if is_encrypted:
lockbox_secret = data_lv.tags['ceph.cephx_lockbox_secret']
# this keyring writing is idempotent
encryption_utils.write_lockbox_keyring(osd_id, osd_fsid, lockbox_secret)
dmcrypt_secret = encryption_utils.get_dmcrypt_key(osd_id, osd_fsid)
encryption_utils.luks_open(dmcrypt_secret, data_lv.lv_path, data_lv.lv_uuid)
encryption_utils.luks_open(dmcrypt_secret, osd_journal, journal_uuid)

osd_journal = '/dev/mapper/%s' % journal_uuid
source = '/dev/mapper/%s' % data_lv.lv_uuid
else:
source = data_lv.lv_path

# mount the osd
destination = '/var/lib/ceph/osd/%s-%s' % (conf.cluster, osd_id)
if not system.device_is_mounted(source, destination=destination):
prepare_utils.mount_osd(source, osd_id, is_vdo=is_vdo)

# ensure that the OSD destination is always chowned properly
system.chown(destination)

# always re-do the symlink regardless if it exists, so that the journal
# device path that may have changed can be mapped correctly every time
destination = '/var/lib/ceph/osd/%s-%s/journal' % (conf.cluster, osd_id)
process.run(['ln', '-snf', osd_journal, destination])

# make sure that the journal has proper permissions
system.chown(osd_journal)

if no_systemd is False:
# enable the ceph-volume unit for this OSD
systemctl.enable_volume(osd_id, osd_fsid, 'lvm')

# enable the OSD
systemctl.enable_osd(osd_id)

# start the OSD
systemctl.start_osd(osd_id)
terminal.success("ceph-volume lvm activate successful for osd ID: %s" % osd_id)


def get_osd_device_path(osd_lvs, device_type, dmcrypt_secret=None):
"""
@ -279,30 +199,16 @@ class Activate(object):

# This argument is only available when passed in directly or via
# systemd, not when ``create`` is being used
# placeholder when a new objectstore support will be added
if getattr(args, 'auto_detect_objectstore', False):
logger.info('auto detecting objectstore')
# may get multiple lvs, so can't do get_the_lvs() calls here
for lv in lvs:
has_journal = lv.tags.get('ceph.journal_uuid')
if has_journal:
logger.info('found a journal associated with the OSD, '
'assuming filestore')
return activate_filestore(lvs, args.no_systemd)

logger.info('unable to find a journal associated with the OSD, '
'assuming bluestore')

return activate_bluestore(lvs, args.no_systemd)

# explicit filestore/bluestore flags take precedence
# explicit 'objectstore' flags take precedence
if getattr(args, 'bluestore', False):
activate_bluestore(lvs, args.no_systemd, getattr(args, 'no_tmpfs', False))
elif getattr(args, 'filestore', False):
activate_filestore(lvs, args.no_systemd)
elif any('ceph.block_device' in lv.tags for lv in lvs):
activate_bluestore(lvs, args.no_systemd, getattr(args, 'no_tmpfs', False))
elif any('ceph.data_device' in lv.tags for lv in lvs):
activate_filestore(lvs, args.no_systemd)

def main(self):
sub_command_help = dedent("""
@ -348,11 +254,6 @@ class Activate(object):
action='store_true',
help='force bluestore objectstore activation',
)
parser.add_argument(
'--filestore',
action='store_true',
help='force filestore objectstore activation',
)
parser.add_argument(
'--all',
dest='activate_all',
@ -29,11 +29,10 @@ def device_formatter(devices):
return ''.join(lines)


def ensure_disjoint_device_lists(data, db=[], wal=[], journal=[]):
def ensure_disjoint_device_lists(data, db=[], wal=[]):
# check that all device lists are disjoint with each other
if not all([set(data).isdisjoint(set(db)),
set(data).isdisjoint(set(wal)),
set(data).isdisjoint(set(journal)),
set(db).isdisjoint(set(wal))]):
raise Exception('Device lists are not disjoint')

@ -171,7 +170,7 @@ def group_devices_by_vg(devices):
def get_lvm_fast_allocs(lvs):
return [("{}/{}".format(d.vg_name, d.lv_name), 100.0,
disk.Size(b=int(d.lvs[0].lv_size)), 1) for d in lvs if not
d.used_by_ceph]
d.journal_used_by_ceph]


class Batch(object):
@ -220,13 +219,6 @@ class Batch(object):
default=[],
help='Devices to provision OSDs wal volumes',
)
parser.add_argument(
'--journal-devices',
nargs='*',
type=arg_validators.ValidBatchDevice(),
default=[],
help='Devices to provision OSDs journal volumes',
)
parser.add_argument(
'--auto',
action='store_true',
@ -246,11 +238,6 @@ class Batch(object):
action='store_true',
help='bluestore objectstore (default)',
)
parser.add_argument(
'--filestore',
action='store_true',
help='filestore objectstore',
)
parser.add_argument(
'--report',
action='store_true',
@ -323,25 +310,6 @@ class Batch(object):
type=int,
help='Provision slots on WAL device, can remain unoccupied'
)
def journal_size_in_mb_hack(size):
# TODO give user time to adjust, then remove this
if size and size[-1].isdigit():
mlogger.warning('DEPRECATION NOTICE')
mlogger.warning('--journal-size as integer is parsed as megabytes')
mlogger.warning('A future release will parse integers as bytes')
mlogger.warning('Add a "M" to explicitly pass a megabyte size')
size += 'M'
return disk.Size.parse(size)
parser.add_argument(
'--journal-size',
type=journal_size_in_mb_hack,
help='Override the "osd_journal_size" value, in megabytes'
)
parser.add_argument(
'--journal-slots',
type=int,
help='Provision slots on journal device, can remain unoccupied'
)
parser.add_argument(
'--prepare',
action='store_true',
@ -356,7 +324,7 @@ class Batch(object):
)
self.args = parser.parse_args(argv)
self.parser = parser
for dev_list in ['', 'db_', 'wal_', 'journal_']:
for dev_list in ['', 'db_', 'wal_']:
setattr(self, '{}usable'.format(dev_list), [])

def report(self, plan):
@ -395,7 +363,7 @@ class Batch(object):
'''
Helper for legacy auto behaviour.
Sorts drives into rotating and non-rotating, the latter being used for
db or journal.
db.
'''
mlogger.warning('DEPRECATION NOTICE')
mlogger.warning('You are using the legacy automatic disk sorting behavior')
@ -408,10 +376,7 @@ class Batch(object):
# no need for additional sorting, we'll only deploy standalone on ssds
return
self.args.devices = rotating
if self.args.filestore:
self.args.journal_devices = ssd
else:
self.args.db_devices = ssd
self.args.db_devices = ssd

@decorators.needs_root
def main(self):
@ -420,19 +385,18 @@ class Batch(object):

# Default to bluestore here since defaulting it in add_argument may
# cause both to be True
if not self.args.bluestore and not self.args.filestore:
if not self.args.bluestore:
self.args.bluestore = True

if (self.args.auto and not self.args.db_devices and not
self.args.wal_devices and not self.args.journal_devices):
self.args.wal_devices):
self._sort_rotational_disks()

self._check_slot_args()

ensure_disjoint_device_lists(self.args.devices,
self.args.db_devices,
self.args.wal_devices,
self.args.journal_devices)
self.args.wal_devices)

plan = self.get_plan(self.args)

@ -453,7 +417,6 @@ class Batch(object):
defaults = common.get_default_args()
global_args = [
'bluestore',
'filestore',
'dmcrypt',
'crush_device_class',
'no_systemd',
@ -473,8 +436,6 @@ class Batch(object):
if args.bluestore:
plan = self.get_deployment_layout(args, args.devices, args.db_devices,
args.wal_devices)
elif args.filestore:
plan = self.get_deployment_layout(args, args.devices, args.journal_devices)
return plan

def get_deployment_layout(self, args, devices, fast_devices=[],
@ -500,7 +461,8 @@ class Batch(object):
return plan
requested_osds = args.osds_per_device * len(phys_devs) + len(lvm_devs)

fast_type = 'block_db' if args.bluestore else 'journal'
if args.bluestore:
fast_type = 'block_db'
fast_allocations = self.fast_allocations(fast_devices,
requested_osds,
num_osds,
@ -126,33 +126,12 @@ bluestore_args = {
},
}

filestore_args = {
'--filestore': {
'action': 'store_true',
'help': 'Use the filestore objectstore',
},
'--journal': {
'help': 'A logical volume (vg_name/lv_name), or path to a device',
'type': arg_validators.ValidDevice(as_string=True),
},
'--journal-size': {
'help': 'Size of journal LV in case a raw block device was passed in --journal',
'default': '0',
'type': disk.Size.parse
},
'--journal-slots': {
'help': ('Intended number of slots on journal device. The new OSD gets one'
'of those slots or 1/nth of the available capacity'),
'type': int,
'default': 1,
},
}

def get_default_args():
defaults = {}
def format_name(name):
return name.strip('-').replace('-', '_').replace('.', '_')
for argset in (common_args, filestore_args, bluestore_args):
for argset in (common_args, bluestore_args):
defaults.update({format_name(name): val.get('default', None) for name, val in argset.items()})
return defaults

@ -168,7 +147,6 @@ def common_parser(prog, description):
description=description,
)

filestore_group = parser.add_argument_group('filestore')
bluestore_group = parser.add_argument_group('bluestore')

for name, kwargs in common_args.items():
@ -177,9 +155,6 @@ def common_parser(prog, description):
for name, kwargs in bluestore_args.items():
bluestore_group.add_argument(name, **kwargs)

for name, kwargs in filestore_args.items():
filestore_group.add_argument(name, **kwargs)

# Do not parse args, so that consumers can do something before the args get
# parsed triggering argparse behavior
return parser
@ -68,10 +68,10 @@ class Create(object):
if len(self.argv) == 0:
print(sub_command_help)
return
exclude_group_options(parser, groups=['filestore', 'bluestore'], argv=self.argv)
exclude_group_options(parser, groups=['bluestore'], argv=self.argv)
args = parser.parse_args(self.argv)
# Default to bluestore here since defaulting it in add_argument may
# cause both to be True
if not args.bluestore and not args.filestore:
if not args.bluestore:
args.bluestore = True
self.create(args)
@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
def prepare_dmcrypt(key, device, device_type, tags):
"""
Helper for devices that are encrypted. The operations needed for
block, db, wal, or data/journal devices are all the same
block, db, wal devices are all the same
"""
if not device:
return ''
@ -37,50 +37,6 @@ def prepare_dmcrypt(key, device, device_type, tags):
return '/dev/mapper/%s' % uuid


def prepare_filestore(device, journal, secrets, tags, osd_id, fsid):
"""
:param device: The name of the logical volume to work with
:param journal: similar to device but can also be a regular/plain disk
:param secrets: A dict with the secrets needed to create the osd (e.g. cephx)
:param id_: The OSD id
:param fsid: The OSD fsid, also known as the OSD UUID
"""
cephx_secret = secrets.get('cephx_secret', prepare_utils.create_key())

# encryption-only operations
if secrets.get('dmcrypt_key'):
# format and open ('decrypt' devices) and re-assign the device and journal
# variables so that the rest of the process can use the mapper paths
key = secrets['dmcrypt_key']
device = prepare_dmcrypt(key, device, 'data', tags)
journal = prepare_dmcrypt(key, journal, 'journal', tags)

# vdo detection
is_vdo = api.is_vdo(device)
# create the directory
prepare_utils.create_osd_path(osd_id)
# format the device
prepare_utils.format_device(device)
# mount the data device
prepare_utils.mount_osd(device, osd_id, is_vdo=is_vdo)
# symlink the journal
prepare_utils.link_journal(journal, osd_id)
# get the latest monmap
prepare_utils.get_monmap(osd_id)
# prepare the osd filesystem
prepare_utils.osd_mkfs_filestore(osd_id, fsid, cephx_secret)
# write the OSD keyring if it doesn't exist already
prepare_utils.write_keyring(osd_id, cephx_secret)
if secrets.get('dmcrypt_key'):
# if the device is going to get activated right away, this can be done
# here, otherwise it will be recreated
encryption_utils.write_lockbox_keyring(
osd_id,
fsid,
tags['ceph.cephx_lockbox_secret']
)


def prepare_bluestore(block, wal, db, secrets, tags, osd_id, fsid):
"""
:param block: The name of the logical volume for the bluestore data
@ -201,7 +157,7 @@ class Prepare(object):
a device or partition will result in error.

:param arg: The value of ``--data`` when parsing args
:param device_type: Usually, either ``data`` or ``block`` (filestore vs. bluestore)
:param device_type: Usually ``block``
:param osd_uuid: The OSD uuid
"""
device = self.args.data
@ -298,60 +254,7 @@ class Prepare(object):
'ceph.crush_device_class': crush_device_class,
'ceph.osdspec_affinity': prepare_utils.get_osdspec_affinity()
}
if self.args.filestore:
if not self.args.journal:
logger.info(('no journal was specifed, creating journal lv '
'on {}').format(self.args.data))
self.args.journal = self.args.data
self.args.journal_size = disk.Size(g=5)
# need to adjust data size/slots for colocated journal
if self.args.data_size:
self.args.data_size -= self.args.journal_size
if self.args.data_slots == 1:
self.args.data_slots = 0
else:
raise RuntimeError('Can\'t handle multiple filestore OSDs '
'with colocated journals yet. Please '
'create journal LVs manually')
tags['ceph.cephx_lockbox_secret'] = cephx_lockbox_secret
tags['ceph.encrypted'] = encrypted

journal_device, journal_uuid, tags = self.setup_device(
'journal',
self.args.journal,
tags,
self.args.journal_size,
self.args.journal_slots)

try:
vg_name, lv_name = self.args.data.split('/')
data_lv = api.get_single_lv(filters={'lv_name': lv_name,
'vg_name': vg_name})
except ValueError:
data_lv = None

if not data_lv:
data_lv = self.prepare_data_device('data', osd_fsid)

tags['ceph.data_device'] = data_lv.lv_path
tags['ceph.data_uuid'] = data_lv.lv_uuid
tags['ceph.vdo'] = api.is_vdo(data_lv.lv_path)
tags['ceph.type'] = 'data'
data_lv.set_tags(tags)
if not journal_device.startswith('/'):
# we got a journal lv, set rest of the tags
api.get_single_lv(filters={'lv_name': lv_name,
'vg_name': vg_name}).set_tags(tags)

prepare_filestore(
data_lv.lv_path,
journal_device,
secrets,
tags,
self.osd_id,
osd_fsid,
)
elif self.args.bluestore:
if self.args.bluestore:
try:
vg_name, lv_name = self.args.data.split('/')
block_lv = api.get_single_lv(filters={'lv_name': lv_name,
@ -427,15 +330,10 @@ class Prepare(object):
if len(self.argv) == 0:
print(sub_command_help)
return
exclude_group_options(parser, argv=self.argv, groups=['filestore', 'bluestore'])
exclude_group_options(parser, argv=self.argv, groups=['bluestore'])
self.args = parser.parse_args(self.argv)
# the unfortunate mix of one superset for both filestore and bluestore
# makes this validation cumbersome
if self.args.filestore:
if not self.args.journal:
raise SystemExit('--journal is required when using --filestore')
# Default to bluestore here since defaulting it in add_argument may
# cause both to be True
if not self.args.bluestore and not self.args.filestore:
if not self.args.bluestore:
self.args.bluestore = True
self.safe_prepare()
@ -101,10 +101,9 @@ def ensure_associated_lvs(lvs, lv_tags={}):
# leaving many journals with osd.1 - usually, only a single LV will be
# returned

journal_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'journal'}))
db_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'db'}))
wal_lvs = api.get_lvs(tags=merge_dict(lv_tags, {'ceph.type': 'wal'}))
backing_devices = [(journal_lvs, 'journal'), (db_lvs, 'db'),
backing_devices = [(db_lvs, 'db'),
(wal_lvs, 'wal')]

verified_devices = []
@ -16,7 +16,7 @@ logger = logging.getLogger(__name__)
def prepare_dmcrypt(key, device, device_type, fsid):
"""
Helper for devices that are encrypted. The operations needed for
block, db, wal, or data/journal devices are all the same
block, db, wal, devices are all the same
"""
if not device:
return ''
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user