mirror of
https://git.proxmox.com/git/ceph.git
synced 2025-04-28 12:54:34 +00:00
import ceph pacific 16.2.15 source
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
This commit is contained in:
parent
ca55da0300
commit
47fdce5df8
@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
|
||||
# remove cmake/modules/FindPython* once 3.12 is required
|
||||
|
||||
project(ceph
|
||||
VERSION 16.2.14
|
||||
VERSION 16.2.15
|
||||
LANGUAGES CXX C ASM)
|
||||
|
||||
foreach(policy
|
||||
|
@ -32,6 +32,29 @@
|
||||
in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
|
||||
the restored file system is expected to have the same ID as before.
|
||||
|
||||
>=16.2.15
|
||||
----------
|
||||
* `ceph config dump --format <json|xml>` output will display the localized
|
||||
option names instead of its normalized version. For e.g.,
|
||||
"mgr/prometheus/x/server_port" will be displayed instead of
|
||||
"mgr/prometheus/server_port". This matches the output of the non pretty-print
|
||||
formatted version of the command.
|
||||
|
||||
* CEPHFS: MDS evicts clients which are not advancing their request tids which causes
|
||||
a large buildup of session metadata resulting in the MDS going read-only due to
|
||||
the RADOS operation exceeding the size threshold. `mds_session_metadata_threshold`
|
||||
config controls the maximum size that a (encoded) session metadata can grow.
|
||||
|
||||
* RADOS: `get_pool_is_selfmanaged_snaps_mode` C++ API has been deprecated
|
||||
due to being prone to false negative results. It's safer replacement is
|
||||
`pool_is_in_selfmanaged_snaps_mode`.
|
||||
|
||||
* RBD: When diffing against the beginning of time (`fromsnapname == NULL`) in
|
||||
fast-diff mode (`whole_object == true` with `fast-diff` image feature enabled
|
||||
and valid), diff-iterate is now guaranteed to execute locally if exclusive
|
||||
lock is available. This brings a dramatic performance improvement for QEMU
|
||||
live disk synchronization and backup use cases.
|
||||
|
||||
>= 16.2.14
|
||||
----------
|
||||
|
||||
@ -132,6 +155,10 @@
|
||||
* CEPHFS: After recovering a Ceph File System post following the disaster recovery
|
||||
procedure, the recovered files under `lost+found` directory can now be deleted.
|
||||
* core: cache-tiering is now deprecated.
|
||||
* mgr/snap_schedule: The snap-schedule mgr module now retains one less snapshot
|
||||
than the number mentioned against the config tunable `mds_max_snaps_per_dir`
|
||||
so that a new snapshot can be created and retained during the next schedule
|
||||
run.
|
||||
|
||||
>=16.2.8
|
||||
--------
|
||||
|
@ -1,4 +1,4 @@
|
||||
Sphinx == 4.4.0
|
||||
Sphinx == 5.0.2
|
||||
git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa
|
||||
breathe >= 4.20.0
|
||||
Jinja2
|
||||
|
@ -135,7 +135,7 @@
|
||||
# main package definition
|
||||
#################################################################################
|
||||
Name: ceph
|
||||
Version: 16.2.14
|
||||
Version: 16.2.15
|
||||
Release: 0%{?dist}
|
||||
%if 0%{?fedora} || 0%{?rhel}
|
||||
Epoch: 2
|
||||
@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
|
||||
Group: System/Filesystems
|
||||
%endif
|
||||
URL: http://ceph.com/
|
||||
Source0: %{?_remote_tarball_prefix}ceph-16.2.14.tar.bz2
|
||||
Source0: %{?_remote_tarball_prefix}ceph-16.2.15.tar.bz2
|
||||
%if 0%{?suse_version}
|
||||
# _insert_obs_source_lines_here
|
||||
ExclusiveArch: x86_64 aarch64 ppc64le s390x
|
||||
@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus.
|
||||
# common
|
||||
#################################################################################
|
||||
%prep
|
||||
%autosetup -p1 -n ceph-16.2.14
|
||||
%autosetup -p1 -n ceph-16.2.15
|
||||
|
||||
%build
|
||||
# Disable lto on systems that do not support symver attribute
|
||||
|
@ -1,7 +1,13 @@
|
||||
ceph (16.2.14-1focal) focal; urgency=medium
|
||||
ceph (16.2.15-1focal) focal; urgency=medium
|
||||
|
||||
|
||||
-- Jenkins Build Slave User <jenkins-build@braggi13.front.sepia.ceph.com> Tue, 29 Aug 2023 16:38:35 +0000
|
||||
-- Jenkins Build Slave User <jenkins-build@braggi16.front.sepia.ceph.com> Mon, 26 Feb 2024 19:34:01 +0000
|
||||
|
||||
ceph (16.2.15-1) stable; urgency=medium
|
||||
|
||||
* New upstream release
|
||||
|
||||
-- Ceph Release Team <ceph-maintainers@ceph.io> Mon, 26 Feb 2024 19:21:07 +0000
|
||||
|
||||
ceph (16.2.14-1) stable; urgency=medium
|
||||
|
||||
|
@ -56,12 +56,13 @@ function(build_rocksdb)
|
||||
endif()
|
||||
include(CheckCXXCompilerFlag)
|
||||
check_cxx_compiler_flag("-Wno-deprecated-copy" HAS_WARNING_DEPRECATED_COPY)
|
||||
set(rocksdb_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
|
||||
if(HAS_WARNING_DEPRECATED_COPY)
|
||||
set(rocksdb_CXX_FLAGS -Wno-deprecated-copy)
|
||||
string(APPEND rocksdb_CXX_FLAGS " -Wno-deprecated-copy")
|
||||
endif()
|
||||
check_cxx_compiler_flag("-Wno-pessimizing-move" HAS_WARNING_PESSIMIZING_MOVE)
|
||||
if(HAS_WARNING_PESSIMIZING_MOVE)
|
||||
set(rocksdb_CXX_FLAGS "${rocksdb_CXX_FLAGS} -Wno-pessimizing-move")
|
||||
string(APPEND rocksdb_CXX_FLAGS " -Wno-pessimizing-move")
|
||||
endif()
|
||||
if(rocksdb_CXX_FLAGS)
|
||||
list(APPEND rocksdb_CMAKE_ARGS -DCMAKE_CXX_FLAGS='${rocksdb_CXX_FLAGS}')
|
||||
|
@ -15,7 +15,7 @@ creation of multiple file systems use ``ceph fs flag set enable_multiple true``.
|
||||
|
||||
::
|
||||
|
||||
fs new <file system name> <metadata pool name> <data pool name>
|
||||
ceph fs new <file system name> <metadata pool name> <data pool name>
|
||||
|
||||
This command creates a new file system. The file system name and metadata pool
|
||||
name are self-explanatory. The specified data pool is the default data pool and
|
||||
@ -25,13 +25,13 @@ to accommodate the new file system.
|
||||
|
||||
::
|
||||
|
||||
fs ls
|
||||
ceph fs ls
|
||||
|
||||
List all file systems by name.
|
||||
|
||||
::
|
||||
|
||||
fs dump [epoch]
|
||||
ceph fs dump [epoch]
|
||||
|
||||
This dumps the FSMap at the given epoch (default: current) which includes all
|
||||
file system settings, MDS daemons and the ranks they hold, and the list of
|
||||
@ -40,7 +40,7 @@ standby MDS daemons.
|
||||
|
||||
::
|
||||
|
||||
fs rm <file system name> [--yes-i-really-mean-it]
|
||||
ceph fs rm <file system name> [--yes-i-really-mean-it]
|
||||
|
||||
Destroy a CephFS file system. This wipes information about the state of the
|
||||
file system from the FSMap. The metadata pool and data pools are untouched and
|
||||
@ -48,28 +48,28 @@ must be destroyed separately.
|
||||
|
||||
::
|
||||
|
||||
fs get <file system name>
|
||||
ceph fs get <file system name>
|
||||
|
||||
Get information about the named file system, including settings and ranks. This
|
||||
is a subset of the same information from the ``fs dump`` command.
|
||||
is a subset of the same information from the ``ceph fs dump`` command.
|
||||
|
||||
::
|
||||
|
||||
fs set <file system name> <var> <val>
|
||||
ceph fs set <file system name> <var> <val>
|
||||
|
||||
Change a setting on a file system. These settings are specific to the named
|
||||
file system and do not affect other file systems.
|
||||
|
||||
::
|
||||
|
||||
fs add_data_pool <file system name> <pool name/id>
|
||||
ceph fs add_data_pool <file system name> <pool name/id>
|
||||
|
||||
Add a data pool to the file system. This pool can be used for file layouts
|
||||
as an alternate location to store file data.
|
||||
|
||||
::
|
||||
|
||||
fs rm_data_pool <file system name> <pool name/id>
|
||||
ceph fs rm_data_pool <file system name> <pool name/id>
|
||||
|
||||
This command removes the specified pool from the list of data pools for the
|
||||
file system. If any files have layouts for the removed data pool, the file
|
||||
@ -82,7 +82,7 @@ Settings
|
||||
|
||||
::
|
||||
|
||||
fs set <fs name> max_file_size <size in bytes>
|
||||
ceph fs set <fs name> max_file_size <size in bytes>
|
||||
|
||||
CephFS has a configurable maximum file size, and it's 1TB by default.
|
||||
You may wish to set this limit higher if you expect to store large files
|
||||
@ -116,13 +116,13 @@ Taking a CephFS cluster down is done by setting the down flag:
|
||||
|
||||
::
|
||||
|
||||
fs set <fs_name> down true
|
||||
ceph fs set <fs_name> down true
|
||||
|
||||
To bring the cluster back online:
|
||||
|
||||
::
|
||||
|
||||
fs set <fs_name> down false
|
||||
ceph fs set <fs_name> down false
|
||||
|
||||
This will also restore the previous value of max_mds. MDS daemons are brought
|
||||
down in a way such that journals are flushed to the metadata pool and all
|
||||
@ -133,11 +133,11 @@ Taking the cluster down rapidly for deletion or disaster recovery
|
||||
-----------------------------------------------------------------
|
||||
|
||||
To allow rapidly deleting a file system (for testing) or to quickly bring the
|
||||
file system and MDS daemons down, use the ``fs fail`` command:
|
||||
file system and MDS daemons down, use the ``ceph fs fail`` command:
|
||||
|
||||
::
|
||||
|
||||
fs fail <fs_name>
|
||||
ceph fs fail <fs_name>
|
||||
|
||||
This command sets a file system flag to prevent standbys from
|
||||
activating on the file system (the ``joinable`` flag).
|
||||
@ -146,7 +146,7 @@ This process can also be done manually by doing the following:
|
||||
|
||||
::
|
||||
|
||||
fs set <fs_name> joinable false
|
||||
ceph fs set <fs_name> joinable false
|
||||
|
||||
Then the operator can fail all of the ranks which causes the MDS daemons to
|
||||
respawn as standbys. The file system will be left in a degraded state.
|
||||
@ -154,7 +154,7 @@ respawn as standbys. The file system will be left in a degraded state.
|
||||
::
|
||||
|
||||
# For all ranks, 0-N:
|
||||
mds fail <fs_name>:<n>
|
||||
ceph mds fail <fs_name>:<n>
|
||||
|
||||
Once all ranks are inactive, the file system may also be deleted or left in
|
||||
this state for other purposes (perhaps disaster recovery).
|
||||
@ -163,7 +163,7 @@ To bring the cluster back up, simply set the joinable flag:
|
||||
|
||||
::
|
||||
|
||||
fs set <fs_name> joinable true
|
||||
ceph fs set <fs_name> joinable true
|
||||
|
||||
|
||||
Daemons
|
||||
@ -182,34 +182,35 @@ Commands to manipulate MDS daemons:
|
||||
|
||||
::
|
||||
|
||||
mds fail <gid/name/role>
|
||||
ceph mds fail <gid/name/role>
|
||||
|
||||
Mark an MDS daemon as failed. This is equivalent to what the cluster
|
||||
would do if an MDS daemon had failed to send a message to the mon
|
||||
for ``mds_beacon_grace`` second. If the daemon was active and a suitable
|
||||
standby is available, using ``mds fail`` will force a failover to the standby.
|
||||
standby is available, using ``ceph mds fail`` will force a failover to the
|
||||
standby.
|
||||
|
||||
If the MDS daemon was in reality still running, then using ``mds fail``
|
||||
If the MDS daemon was in reality still running, then using ``ceph mds fail``
|
||||
will cause the daemon to restart. If it was active and a standby was
|
||||
available, then the "failed" daemon will return as a standby.
|
||||
|
||||
|
||||
::
|
||||
|
||||
tell mds.<daemon name> command ...
|
||||
ceph tell mds.<daemon name> command ...
|
||||
|
||||
Send a command to the MDS daemon(s). Use ``mds.*`` to send a command to all
|
||||
daemons. Use ``ceph tell mds.* help`` to learn available commands.
|
||||
|
||||
::
|
||||
|
||||
mds metadata <gid/name/role>
|
||||
ceph mds metadata <gid/name/role>
|
||||
|
||||
Get metadata about the given MDS known to the Monitors.
|
||||
|
||||
::
|
||||
|
||||
mds repaired <role>
|
||||
ceph mds repaired <role>
|
||||
|
||||
Mark the file system rank as repaired. Unlike the name suggests, this command
|
||||
does not change a MDS; it manipulates the file system rank which has been
|
||||
@ -228,14 +229,14 @@ Commands to manipulate required client features of a file system:
|
||||
|
||||
::
|
||||
|
||||
fs required_client_features <fs name> add reply_encoding
|
||||
fs required_client_features <fs name> rm reply_encoding
|
||||
ceph fs required_client_features <fs name> add reply_encoding
|
||||
ceph fs required_client_features <fs name> rm reply_encoding
|
||||
|
||||
To list all CephFS features
|
||||
|
||||
::
|
||||
|
||||
fs feature ls
|
||||
ceph fs feature ls
|
||||
|
||||
Clients that are missing newly added features will be evicted automatically.
|
||||
|
||||
@ -330,7 +331,7 @@ Global settings
|
||||
|
||||
::
|
||||
|
||||
fs flag set <flag name> <flag val> [<confirmation string>]
|
||||
ceph fs flag set <flag name> <flag val> [<confirmation string>]
|
||||
|
||||
Sets a global CephFS flag (i.e. not specific to a particular file system).
|
||||
Currently, the only flag setting is 'enable_multiple' which allows having
|
||||
@ -352,13 +353,13 @@ file system.
|
||||
|
||||
::
|
||||
|
||||
mds rmfailed
|
||||
ceph mds rmfailed
|
||||
|
||||
This removes a rank from the failed set.
|
||||
|
||||
::
|
||||
|
||||
fs reset <file system name>
|
||||
ceph fs reset <file system name>
|
||||
|
||||
This command resets the file system state to defaults, except for the name and
|
||||
pools. Non-zero ranks are saved in the stopped set.
|
||||
@ -366,7 +367,7 @@ pools. Non-zero ranks are saved in the stopped set.
|
||||
|
||||
::
|
||||
|
||||
fs new <file system name> <metadata pool name> <data pool name> --fscid <fscid> --force
|
||||
ceph fs new <file system name> <metadata pool name> <data pool name> --fscid <fscid> --force
|
||||
|
||||
This command creates a file system with a specific **fscid** (file system cluster ID).
|
||||
You may want to do this when an application expects the file system's ID to be
|
||||
|
@ -37,7 +37,7 @@ Options :
|
||||
.. code:: bash
|
||||
|
||||
[build]$ python3 -m venv venv && source venv/bin/activate && pip3 install cmd2
|
||||
[build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/cephfs-shell
|
||||
[build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/shell/cephfs-shell
|
||||
|
||||
Commands
|
||||
========
|
||||
|
@ -24,6 +24,16 @@ that directory.
|
||||
To restrict clients to only mount and work within a certain directory, use
|
||||
path-based MDS authentication capabilities.
|
||||
|
||||
Note that this restriction *only* impacts the filesystem hierarchy -- the metadata
|
||||
tree managed by the MDS. Clients will still be able to access the underlying
|
||||
file data in RADOS directly. To segregate clients fully, you must also isolate
|
||||
untrusted clients in their own RADOS namespace. You can place a client's
|
||||
filesystem subtree in a particular namespace using `file layouts`_ and then
|
||||
restrict their RADOS access to that namespace using `OSD capabilities`_
|
||||
|
||||
.. _file layouts: ./file-layouts
|
||||
.. _OSD capabilities: ../rados/operations/user-management/#authorization-capabilities
|
||||
|
||||
Syntax
|
||||
------
|
||||
|
||||
|
@ -38,6 +38,13 @@ below). By default
|
||||
the start time is last midnight. So when a snapshot schedule with repeat
|
||||
interval `1h` is added at 13:50
|
||||
with the default start time, the first snapshot will be taken at 14:00.
|
||||
The time zone is assumed to be UTC if none is explicitly included in the string.
|
||||
An explicit time zone will be mapped to UTC at execution.
|
||||
The start time must be in ISO8601 format. Examples below:
|
||||
|
||||
UTC: 2022-08-08T05:30:00 i.e. 5:30 AM UTC, without explicit time zone offset
|
||||
IDT: 2022-08-08T09:00:00+03:00 i.e. 6:00 AM UTC
|
||||
EDT: 2022-08-08T05:30:00-04:00 i.e. 9:30 AM UTC
|
||||
|
||||
Retention specifications are identified by path and the retention spec itself. A
|
||||
retention spec consists of either a number and a time period separated by a
|
||||
@ -155,6 +162,11 @@ Examples::
|
||||
snapshot creation is accounted for in the "created_count" field, which is a
|
||||
cumulative count of the total number of snapshots created so far.
|
||||
|
||||
.. note: The maximum number of snapshots to retain per directory is limited by the
|
||||
config tunable `mds_max_snaps_per_dir`. This tunable defaults to 100.
|
||||
To ensure a new snapshot can be created, one snapshot less than this will be
|
||||
retained. So by default, a maximum of 99 snapshots will be retained.
|
||||
|
||||
Active and inactive schedules
|
||||
-----------------------------
|
||||
Snapshot schedules can be added for a path that doesn't exist yet in the
|
||||
|
@ -60,6 +60,8 @@ Possible -op commands::
|
||||
* meta-list
|
||||
* get-osdmap
|
||||
* set-osdmap
|
||||
* get-superblock
|
||||
* set-superblock
|
||||
* get-inc-osdmap
|
||||
* set-inc-osdmap
|
||||
* mark-complete
|
||||
@ -414,7 +416,7 @@ Options
|
||||
|
||||
.. option:: --op arg
|
||||
|
||||
Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log]
|
||||
Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log]
|
||||
|
||||
.. option:: --epoch arg
|
||||
|
||||
@ -422,7 +424,7 @@ Options
|
||||
|
||||
.. option:: --file arg
|
||||
|
||||
path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap
|
||||
path of file to export, export-remove, import, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap or set-inc-osdmap
|
||||
|
||||
.. option:: --mon-store-path arg
|
||||
|
||||
|
@ -1314,7 +1314,7 @@ Subcommand ``cache-mode`` specifies the caching mode for cache tier <pool>.
|
||||
|
||||
Usage::
|
||||
|
||||
ceph osd tier cache-mode <poolname> writeback|readproxy|readonly|none
|
||||
ceph osd tier cache-mode <poolname> writeback|proxy|readproxy|readonly|none
|
||||
|
||||
Subcommand ``remove`` removes the tier <tierpool> (the second one) from base pool
|
||||
<pool> (the first one).
|
||||
|
@ -264,8 +264,8 @@ Pool specific commands
|
||||
:command:`append` *name* *infile*
|
||||
Append object name to the cluster with contents from infile.
|
||||
|
||||
:command:`rm` *name*
|
||||
Remove object name.
|
||||
:command:`rm` [--force-full] *name* ...
|
||||
Remove object(s) with name(s). With ``--force-full`` will remove when cluster is marked full.
|
||||
|
||||
:command:`listwatchers` *name*
|
||||
List the watchers of object name.
|
||||
|
@ -333,7 +333,7 @@ OSD and run the following command:
|
||||
|
||||
ceph-bluestore-tool \
|
||||
--path <data path> \
|
||||
--sharding="m(3) p(3,0-12) o(3,0-13)=block_cache={type=binned_lru} l p" \
|
||||
--sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \
|
||||
reshard
|
||||
|
||||
|
||||
|
@ -109,17 +109,6 @@ Async messenger options
|
||||
:Default: ``3``
|
||||
|
||||
|
||||
``ms_async_max_op_threads``
|
||||
|
||||
:Description: Maximum number of worker threads used by each Async Messenger instance.
|
||||
Set to lower values when your machine has limited CPU count, and increase
|
||||
when your CPUs are underutilized (i. e. one or more of CPUs are
|
||||
constantly on 100% load during I/O operations).
|
||||
:Type: 64-bit Unsigned Integer
|
||||
:Required: No
|
||||
:Default: ``5``
|
||||
|
||||
|
||||
``ms_async_send_inline``
|
||||
|
||||
:Description: Send messages directly from the thread that generated them instead of
|
||||
@ -129,5 +118,3 @@ Async messenger options
|
||||
:Type: Boolean
|
||||
:Required: No
|
||||
:Default: ``false``
|
||||
|
||||
|
||||
|
@ -4,12 +4,41 @@
|
||||
|
||||
.. index:: pools; configuration
|
||||
|
||||
Ceph uses default values to determine how many placement groups (PGs) will be
|
||||
assigned to each pool. We recommend overriding some of the defaults.
|
||||
Specifically, we recommend setting a pool's replica size and overriding the
|
||||
default number of placement groups. You can set these values when running
|
||||
`pool`_ commands. You can also override the defaults by adding new ones in the
|
||||
``[global]`` section of your Ceph configuration file.
|
||||
The number of placement groups that the CRUSH algorithm assigns to each pool is
|
||||
determined by the values of variables in the centralized configuration database
|
||||
in the monitor cluster.
|
||||
|
||||
Both containerized deployments of Ceph (deployments made using ``cephadm`` or
|
||||
Rook) and non-containerized deployments of Ceph rely on the values in the
|
||||
central configuration database in the monitor cluster to assign placement
|
||||
groups to pools.
|
||||
|
||||
Example Commands
|
||||
----------------
|
||||
|
||||
To see the value of the variable that governs the number of placement groups in a given pool, run a command of the following form:
|
||||
|
||||
.. prompt:: bash
|
||||
|
||||
ceph config get osd osd_pool_default_pg_num
|
||||
|
||||
To set the value of the variable that governs the number of placement groups in a given pool, run a command of the following form:
|
||||
|
||||
.. prompt:: bash
|
||||
|
||||
ceph config set osd osd_pool_default_pg_num
|
||||
|
||||
Manual Tuning
|
||||
-------------
|
||||
In some cases, it might be advisable to override some of the defaults. For
|
||||
example, you might determine that it is wise to set a pool's replica size and
|
||||
to override the default number of placement groups in the pool. You can set
|
||||
these values when running `pool`_ commands.
|
||||
|
||||
See Also
|
||||
--------
|
||||
|
||||
See :ref:`pg-autoscaler`.
|
||||
|
||||
|
||||
.. literalinclude:: pool-pg.conf
|
||||
|
@ -1404,6 +1404,31 @@ other performance issue with the OSDs.
|
||||
The exact size of the snapshot trim queue is reported by the ``snaptrimq_len``
|
||||
field of ``ceph pg ls -f json-detail``.
|
||||
|
||||
Stretch Mode
|
||||
------------
|
||||
|
||||
INCORRECT_NUM_BUCKETS_STRETCH_MODE
|
||||
__________________________________
|
||||
|
||||
Stretch mode currently only support 2 dividing buckets with OSDs, this warning suggests
|
||||
that the number of dividing buckets is not equal to 2 after stretch mode is enabled.
|
||||
You can expect unpredictable failures and MON assertions until the condition is fixed.
|
||||
|
||||
We encourage you to fix this by removing additional dividing buckets or bump the
|
||||
number of dividing buckets to 2.
|
||||
|
||||
UNEVEN_WEIGHTS_STRETCH_MODE
|
||||
___________________________
|
||||
|
||||
The 2 dividing buckets must have equal weights when stretch mode is enabled.
|
||||
This warning suggests that the 2 dividing buckets have uneven weights after
|
||||
stretch mode is enabled. This is not immediately fatal, however, you can expect
|
||||
Ceph to be confused when trying to process transitions between dividing buckets.
|
||||
|
||||
We encourage you to fix this by making the weights even on both dividing buckets.
|
||||
This can be done by making sure the combined weight of the OSDs on each dividing
|
||||
bucket are the same.
|
||||
|
||||
Miscellaneous
|
||||
-------------
|
||||
|
||||
|
@ -127,6 +127,14 @@ Options
|
||||
:Type: Integer
|
||||
:Default: ``65000``
|
||||
|
||||
``max_header_size``
|
||||
|
||||
:Description: The maximum number of header bytes available for a single request.
|
||||
|
||||
:Type: Integer
|
||||
:Default: ``16384``
|
||||
:Maximum: ``65536``
|
||||
|
||||
|
||||
Civetweb
|
||||
========
|
||||
|
@ -55,7 +55,7 @@ download_from() {
|
||||
exit
|
||||
fi
|
||||
url=$url_base/$fname
|
||||
wget -c --no-verbose -O $fname $url
|
||||
wget --no-verbose -O $fname $url
|
||||
if [ $? != 0 -o ! -e $fname ]; then
|
||||
echo "Download of $url failed"
|
||||
elif [ $(sha256sum $fname | awk '{print $1}') != $sha256 ]; then
|
||||
@ -183,8 +183,7 @@ download_boost $boost_version 4eb3b8d442b426dc35346235c8733b5ae35ba431690e38c6a8
|
||||
https://boostorg.jfrog.io/artifactory/main/release/$boost_version/source \
|
||||
https://downloads.sourceforge.net/project/boost/boost/$boost_version \
|
||||
https://download.ceph.com/qa
|
||||
download_liburing 0.7 8e2842cfe947f3a443af301bdd6d034455536c38a455c7a700d0c1ad165a7543 \
|
||||
https://github.com/axboe/liburing/archive \
|
||||
download_liburing 0.7 05d0cf8493d573c76b11abfcf34aabc7153affebe17ff95f9ae88b0de062a59d \
|
||||
https://git.kernel.dk/cgit/liburing/snapshot
|
||||
pmdk_version=1.10
|
||||
download_pmdk $pmdk_version 08dafcf94db5ac13fac9139c92225d9aa5f3724ea74beee4e6ca19a01a2eb20c \
|
||||
|
@ -342,7 +342,7 @@ local g = import 'grafonnet/grafana.libsonnet';
|
||||
$.graphPanelSchema({},
|
||||
title,
|
||||
description,
|
||||
'null',
|
||||
'null as zero',
|
||||
false,
|
||||
formatY1,
|
||||
'short',
|
||||
|
@ -133,7 +133,7 @@ local u = import 'utils.libsonnet';
|
||||
$.graphPanelSchema({},
|
||||
title,
|
||||
'',
|
||||
'null',
|
||||
'null as zero',
|
||||
false,
|
||||
formatY1,
|
||||
'short',
|
||||
|
@ -140,7 +140,7 @@ local u = import 'utils.libsonnet';
|
||||
{},
|
||||
title,
|
||||
description,
|
||||
'null',
|
||||
'null as zero',
|
||||
false,
|
||||
formatY1,
|
||||
formatY2,
|
||||
@ -658,7 +658,7 @@ local u = import 'utils.libsonnet';
|
||||
$.graphPanelSchema(aliasColors,
|
||||
title,
|
||||
description,
|
||||
'null',
|
||||
'null as zero',
|
||||
false,
|
||||
formatY1,
|
||||
formatY2,
|
||||
|
@ -87,7 +87,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -185,7 +185,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -283,7 +283,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -400,7 +400,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -498,7 +498,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -596,7 +596,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
|
@ -93,7 +93,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -186,7 +186,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -285,7 +285,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
|
@ -87,7 +87,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -180,7 +180,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -266,7 +266,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -352,7 +352,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -445,7 +445,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -531,7 +531,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -636,7 +636,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -754,7 +754,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -893,7 +893,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -1000,7 +1000,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
|
@ -80,7 +80,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -173,7 +173,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
@ -266,7 +266,7 @@
|
||||
"lines": true,
|
||||
"linewidth": 1,
|
||||
"links": [ ],
|
||||
"nullPointMode": "null",
|
||||
"nullPointMode": "null as zero",
|
||||
"percentage": false,
|
||||
"pointradius": 5,
|
||||
"points": false,
|
||||
|
@ -518,7 +518,7 @@ groups:
|
||||
annotations:
|
||||
description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours."
|
||||
summary: "Pool growth rate may soon exceed capacity"
|
||||
expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) group_right ceph_pool_metadata) >= 95"
|
||||
expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95"
|
||||
labels:
|
||||
oid: "1.3.6.1.4.1.50495.1.2.1.9.2"
|
||||
severity: "warning"
|
||||
|
@ -1499,35 +1499,44 @@ tests:
|
||||
# trigger percent full prediction on pools 1 and 2 only
|
||||
- interval: 12h
|
||||
input_series:
|
||||
- series: 'ceph_pool_percent_used{pool_id="1"}'
|
||||
values: '70 75 80 87 92'
|
||||
- series: 'ceph_pool_percent_used{pool_id="2"}'
|
||||
values: '22 22 23 23 24'
|
||||
- series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}'
|
||||
- series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}'
|
||||
values: '1 1 1 1 1'
|
||||
- series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}'
|
||||
- series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}'
|
||||
values: '78 89 79 98 78'
|
||||
- series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}'
|
||||
values: '1 1 1 1 1'
|
||||
- series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}'
|
||||
values: '22 22 23 23 24'
|
||||
- series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}'
|
||||
values: '1 1 1 1 1'
|
||||
- series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}'
|
||||
values: '1 1 1 1 1'
|
||||
- series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}'
|
||||
values: '1 1 1 1 1'
|
||||
- series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}'
|
||||
values: '1 1 1 1 1'
|
||||
promql_expr_test:
|
||||
- expr: |
|
||||
(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id)
|
||||
group_right ceph_pool_metadata) >= 95
|
||||
(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance)
|
||||
group_right() ceph_pool_metadata) >= 95
|
||||
eval_time: 36h
|
||||
exp_samples:
|
||||
- labels: '{name="rbd",pool_id="1",type="replicated"}'
|
||||
value: 1.424E+02 # 142%
|
||||
- labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}'
|
||||
value: 1.435E+02 # 142%
|
||||
alert_rule_test:
|
||||
- eval_time: 48h
|
||||
alertname: CephPoolGrowthWarning
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
name: rbd
|
||||
instance: 8090
|
||||
name: default.rgw.index
|
||||
pool_id: 1
|
||||
severity: warning
|
||||
type: ceph_default
|
||||
oid: 1.3.6.1.4.1.50495.1.2.1.9.2
|
||||
exp_annotations:
|
||||
summary: Pool growth rate may soon exceed capacity
|
||||
description: Pool 'rbd' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
|
||||
description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours.
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
|
||||
|
@ -3,6 +3,7 @@ overrides:
|
||||
conf:
|
||||
mds:
|
||||
debug mds: 20
|
||||
debug mds balancer: 20
|
||||
debug ms: 1
|
||||
mds debug frag: true
|
||||
mds debug scatterstat: true
|
||||
|
@ -2,7 +2,10 @@ overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- overall HEALTH_
|
||||
- \(CEPHADM_STRAY_DAEMON\)
|
||||
- \(FS_DEGRADED\)
|
||||
- FS_
|
||||
- \(CEPHADM_
|
||||
- \(MDS_FAILED\)
|
||||
- \(MDS_DEGRADED\)
|
||||
- \(FS_WITH_FAILED_MDS\)
|
||||
@ -10,3 +13,10 @@ overrides:
|
||||
- \(MDS_ALL_DOWN\)
|
||||
- \(MDS_UP_LESS_THAN_MAX\)
|
||||
- \(FS_INLINE_DATA_DEPRECATED\)
|
||||
- \(PG_DEGRADED\)
|
||||
- Degraded data redundancy
|
||||
- \(PG_
|
||||
- acting
|
||||
- MDS_INSUFFICIENT_STANDBY
|
||||
- deprecated feature inline_data
|
||||
- compat changed unexpectedly
|
||||
|
@ -2,8 +2,10 @@ overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- overall HEALTH_
|
||||
- \(OSD_DOWN\)
|
||||
- \(OSD_
|
||||
- OSD_DOWN
|
||||
- OSD_
|
||||
- but it is still running
|
||||
# MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a'
|
||||
- is not responding
|
||||
- is down
|
||||
- osds down
|
||||
|
6
ceph/qa/distros/all/rhel_8.5.yaml
Normal file
6
ceph/qa/distros/all/rhel_8.5.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
os_type: rhel
|
||||
os_version: "8.5"
|
||||
overrides:
|
||||
selinux:
|
||||
whitelist:
|
||||
- scontext=system_u:system_r:logrotate_t:s0
|
6
ceph/qa/distros/all/rhel_8.6.yaml
Normal file
6
ceph/qa/distros/all/rhel_8.6.yaml
Normal file
@ -0,0 +1,6 @@
|
||||
os_type: rhel
|
||||
os_version: "8.6"
|
||||
overrides:
|
||||
selinux:
|
||||
whitelist:
|
||||
- scontext=system_u:system_r:logrotate_t:s0
|
@ -1 +1 @@
|
||||
rhel_8.4.yaml
|
||||
rhel_8.6.yaml
|
@ -1 +0,0 @@
|
||||
.qa/distros/podman/rhel_8.4_container_tools_3.0.yaml
|
@ -1 +0,0 @@
|
||||
.qa/distros/podman/rhel_8.4_container_tools_rhel8.yaml
|
@ -0,0 +1 @@
|
||||
.qa/distros/podman/rhel_8.6_container_tools_3.0.yaml
|
@ -0,0 +1 @@
|
||||
.qa/distros/podman/rhel_8.6_container_tools_rhel8.yaml
|
@ -1,5 +1,5 @@
|
||||
os_type: rhel
|
||||
os_version: "8.4"
|
||||
os_version: "8.6"
|
||||
overrides:
|
||||
selinux:
|
||||
whitelist:
|
@ -1,5 +1,5 @@
|
||||
os_type: rhel
|
||||
os_version: "8.4"
|
||||
os_version: "8.6"
|
||||
overrides:
|
||||
selinux:
|
||||
whitelist:
|
@ -1691,6 +1691,29 @@ function test_wait_for_peered() {
|
||||
|
||||
#######################################################################
|
||||
|
||||
##
|
||||
# Wait until the cluster's health condition disappeared.
|
||||
# $TIMEOUT default
|
||||
#
|
||||
# @param string to grep for in health detail
|
||||
# @return 0 if the cluster health doesn't matches request,
|
||||
# 1 otherwise if after $TIMEOUT seconds health condition remains.
|
||||
#
|
||||
function wait_for_health_gone() {
|
||||
local grepstr=$1
|
||||
local -a delays=($(get_timeout_delays $TIMEOUT .1))
|
||||
local -i loop=0
|
||||
|
||||
while ceph health detail | grep "$grepstr" ; do
|
||||
if (( $loop >= ${#delays[*]} )) ; then
|
||||
ceph health detail
|
||||
return 1
|
||||
fi
|
||||
sleep ${delays[$loop]}
|
||||
loop+=1
|
||||
done
|
||||
}
|
||||
|
||||
##
|
||||
# Wait until the cluster has health condition passed as arg
|
||||
# again for $TIMEOUT seconds.
|
||||
|
148
ceph/qa/standalone/mon-stretch/mon-stretch-fail-recovery.sh
Executable file
148
ceph/qa/standalone/mon-stretch/mon-stretch-fail-recovery.sh
Executable file
@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
|
||||
function run() {
|
||||
local dir=$1
|
||||
shift
|
||||
|
||||
export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
|
||||
export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
|
||||
export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
|
||||
export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
|
||||
export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
|
||||
export CEPH_ARGS
|
||||
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
|
||||
|
||||
export BASE_CEPH_ARGS=$CEPH_ARGS
|
||||
CEPH_ARGS+="--mon-host=$CEPH_MON_A"
|
||||
|
||||
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
|
||||
for func in $funcs ; do
|
||||
setup $dir || return 1
|
||||
$func $dir || return 1
|
||||
teardown $dir || return 1
|
||||
done
|
||||
}
|
||||
TEST_stretched_cluster_failover_add_three_osds(){
|
||||
local dir=$1
|
||||
local OSDS=8
|
||||
setup $dir || return 1
|
||||
|
||||
run_mon $dir a --public-addr $CEPH_MON_A || return 1
|
||||
wait_for_quorum 300 1 || return 1
|
||||
|
||||
run_mon $dir b --public-addr $CEPH_MON_B || return 1
|
||||
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
|
||||
wait_for_quorum 300 2 || return 1
|
||||
|
||||
run_mon $dir c --public-addr $CEPH_MON_C || return 1
|
||||
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
|
||||
wait_for_quorum 300 3 || return 1
|
||||
|
||||
run_mon $dir d --public-addr $CEPH_MON_D || return 1
|
||||
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
|
||||
wait_for_quorum 300 4 || return 1
|
||||
|
||||
run_mon $dir e --public-addr $CEPH_MON_E || return 1
|
||||
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
|
||||
wait_for_quorum 300 5 || return 1
|
||||
|
||||
ceph mon set election_strategy connectivity
|
||||
ceph mon add disallowed_leader e
|
||||
|
||||
run_mgr $dir x || return 1
|
||||
run_mgr $dir y || return 1
|
||||
run_mgr $dir z || return 1
|
||||
|
||||
for osd in $(seq 0 $(expr $OSDS - 1))
|
||||
do
|
||||
run_osd $dir $osd || return 1
|
||||
done
|
||||
|
||||
for zone in iris pze
|
||||
do
|
||||
ceph osd crush add-bucket $zone zone
|
||||
ceph osd crush move $zone root=default
|
||||
done
|
||||
|
||||
|
||||
ceph osd crush add-bucket node-2 host
|
||||
ceph osd crush add-bucket node-3 host
|
||||
ceph osd crush add-bucket node-4 host
|
||||
ceph osd crush add-bucket node-5 host
|
||||
|
||||
ceph osd crush move node-2 zone=iris
|
||||
ceph osd crush move node-3 zone=iris
|
||||
ceph osd crush move node-4 zone=pze
|
||||
ceph osd crush move node-5 zone=pze
|
||||
|
||||
ceph osd crush move osd.0 host=node-2
|
||||
ceph osd crush move osd.1 host=node-2
|
||||
ceph osd crush move osd.2 host=node-3
|
||||
ceph osd crush move osd.3 host=node-3
|
||||
ceph osd crush move osd.4 host=node-4
|
||||
ceph osd crush move osd.5 host=node-4
|
||||
ceph osd crush move osd.6 host=node-5
|
||||
ceph osd crush move osd.7 host=node-5
|
||||
|
||||
ceph mon set_location a zone=iris host=node-2
|
||||
ceph mon set_location b zone=iris host=node-3
|
||||
ceph mon set_location c zone=pze host=node-4
|
||||
ceph mon set_location d zone=pze host=node-5
|
||||
|
||||
hostname=$(hostname -s)
|
||||
ceph osd crush remove $hostname || return 1
|
||||
ceph osd getcrushmap > crushmap || return 1
|
||||
crushtool --decompile crushmap > crushmap.txt || return 1
|
||||
sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1
|
||||
cat >> crushmap_modified.txt << EOF
|
||||
rule stretch_rule {
|
||||
id 1
|
||||
type replicated
|
||||
min_size 1
|
||||
max_size 10
|
||||
step take iris
|
||||
step chooseleaf firstn 2 type host
|
||||
step emit
|
||||
step take pze
|
||||
step chooseleaf firstn 2 type host
|
||||
step emit
|
||||
}
|
||||
|
||||
# end crush map
|
||||
EOF
|
||||
|
||||
crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1
|
||||
ceph osd setcrushmap -i crushmap.bin || return 1
|
||||
local stretched_poolname=stretched_rbdpool
|
||||
ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1
|
||||
ceph osd pool set $stretched_poolname size 4 || return 1
|
||||
|
||||
sleep 3
|
||||
|
||||
ceph mon set_location e zone=arbiter host=node-1
|
||||
ceph mon enable_stretch_mode e stretch_rule zone
|
||||
|
||||
kill_daemons $dir KILL mon.c || return 1
|
||||
kill_daemons $dir KILL mon.d || return 1
|
||||
|
||||
kill_daemons $dir KILL osd.4 || return 1
|
||||
kill_daemons $dir KILL osd.5 || return 1
|
||||
kill_daemons $dir KILL osd.6 || return 1
|
||||
kill_daemons $dir KILL osd.7 || return 1
|
||||
|
||||
ceph -s
|
||||
|
||||
sleep 3
|
||||
|
||||
run_osd $dir 8 || return 1
|
||||
run_osd $dir 9 || return 1
|
||||
run_osd $dir 10 || return 1
|
||||
|
||||
ceph -s
|
||||
|
||||
sleep 3
|
||||
|
||||
teardown $dir || return 1
|
||||
}
|
||||
main mon-stretch-fail-recovery "$@"
|
145
ceph/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh
Executable file
145
ceph/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh
Executable file
@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
|
||||
function run() {
|
||||
local dir=$1
|
||||
shift
|
||||
|
||||
export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one
|
||||
export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one
|
||||
export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one
|
||||
export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one
|
||||
export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one
|
||||
export CEPH_ARGS
|
||||
CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
|
||||
|
||||
export BASE_CEPH_ARGS=$CEPH_ARGS
|
||||
CEPH_ARGS+="--mon-host=$CEPH_MON_A"
|
||||
|
||||
local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
|
||||
for func in $funcs ; do
|
||||
setup $dir || return 1
|
||||
$func $dir || return 1
|
||||
teardown $dir || return 1
|
||||
done
|
||||
}
|
||||
TEST_stretched_cluster_uneven_weight() {
|
||||
local dir=$1
|
||||
local OSDS=4
|
||||
local weight=0.09000
|
||||
setup $dir || return 1
|
||||
|
||||
run_mon $dir a --public-addr $CEPH_MON_A || return 1
|
||||
wait_for_quorum 300 1 || return 1
|
||||
|
||||
run_mon $dir b --public-addr $CEPH_MON_B || return 1
|
||||
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B"
|
||||
wait_for_quorum 300 2 || return 1
|
||||
|
||||
run_mon $dir c --public-addr $CEPH_MON_C || return 1
|
||||
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C"
|
||||
wait_for_quorum 300 3 || return 1
|
||||
|
||||
run_mon $dir d --public-addr $CEPH_MON_D || return 1
|
||||
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D"
|
||||
wait_for_quorum 300 4 || return 1
|
||||
|
||||
run_mon $dir e --public-addr $CEPH_MON_E || return 1
|
||||
CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E"
|
||||
wait_for_quorum 300 5 || return 1
|
||||
|
||||
ceph mon set election_strategy connectivity
|
||||
ceph mon add disallowed_leader e
|
||||
|
||||
run_mgr $dir x || return 1
|
||||
run_mgr $dir y || return 1
|
||||
run_mgr $dir z || return 1
|
||||
|
||||
for osd in $(seq 0 $(expr $OSDS - 1))
|
||||
do
|
||||
run_osd $dir $osd || return 1
|
||||
done
|
||||
|
||||
for zone in iris pze
|
||||
do
|
||||
ceph osd crush add-bucket $zone zone
|
||||
ceph osd crush move $zone root=default
|
||||
done
|
||||
|
||||
ceph osd crush add-bucket node-2 host
|
||||
ceph osd crush add-bucket node-3 host
|
||||
ceph osd crush add-bucket node-4 host
|
||||
ceph osd crush add-bucket node-5 host
|
||||
|
||||
ceph osd crush move node-2 zone=iris
|
||||
ceph osd crush move node-3 zone=iris
|
||||
ceph osd crush move node-4 zone=pze
|
||||
ceph osd crush move node-5 zone=pze
|
||||
|
||||
ceph osd crush move osd.0 host=node-2
|
||||
ceph osd crush move osd.1 host=node-3
|
||||
ceph osd crush move osd.2 host=node-4
|
||||
ceph osd crush move osd.3 host=node-5
|
||||
|
||||
ceph mon set_location a zone=iris host=node-2
|
||||
ceph mon set_location b zone=iris host=node-3
|
||||
ceph mon set_location c zone=pze host=node-4
|
||||
ceph mon set_location d zone=pze host=node-5
|
||||
|
||||
hostname=$(hostname -s)
|
||||
ceph osd crush remove $hostname || return 1
|
||||
ceph osd getcrushmap > crushmap || return 1
|
||||
crushtool --decompile crushmap > crushmap.txt || return 1
|
||||
sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1
|
||||
cat >> crushmap_modified.txt << EOF
|
||||
rule stretch_rule {
|
||||
id 1
|
||||
type replicated
|
||||
min_size 1
|
||||
max_size 10
|
||||
step take iris
|
||||
step chooseleaf firstn 2 type host
|
||||
step emit
|
||||
step take pze
|
||||
step chooseleaf firstn 2 type host
|
||||
step emit
|
||||
}
|
||||
# end crush map
|
||||
EOF
|
||||
|
||||
crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1
|
||||
ceph osd setcrushmap -i crushmap.bin || return 1
|
||||
local stretched_poolname=stretched_rbdpool
|
||||
ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1
|
||||
ceph osd pool set $stretched_poolname size 4 || return 1
|
||||
|
||||
ceph mon set_location e zone=arbiter host=node-1 || return 1
|
||||
ceph mon enable_stretch_mode e stretch_rule zone || return 1 # Enter strech mode
|
||||
|
||||
# reweight to a more round decimal.
|
||||
ceph osd crush reweight osd.0 $weight
|
||||
ceph osd crush reweight osd.1 $weight
|
||||
ceph osd crush reweight osd.2 $weight
|
||||
ceph osd crush reweight osd.3 $weight
|
||||
|
||||
# Firstly, we test for stretch mode buckets != 2
|
||||
ceph osd crush add-bucket sham zone || return 1
|
||||
ceph osd crush move sham root=default || return 1
|
||||
wait_for_health "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1
|
||||
|
||||
ceph osd crush rm sham # clear the health warn
|
||||
wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1
|
||||
|
||||
# Next, we test for uneven weights across buckets
|
||||
|
||||
ceph osd crush reweight osd.0 0.07000
|
||||
|
||||
wait_for_health "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
|
||||
|
||||
ceph osd crush reweight osd.0 $weight # clear the health warn
|
||||
|
||||
wait_for_health_gone "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1
|
||||
|
||||
teardown $dir || return 1
|
||||
}
|
||||
main mon-stretched-cluster-uneven-weight "$@"
|
@ -19,6 +19,7 @@ overrides:
|
||||
- MDS_READ_ONLY
|
||||
- force file system read-only
|
||||
- with standby daemon mds
|
||||
- MDS abort because newly corrupt dentry
|
||||
tasks:
|
||||
- cephfs_test_runner:
|
||||
modules:
|
||||
|
@ -0,0 +1,6 @@
|
||||
# Lengthen the timeout for thrashed MDS
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
client:
|
||||
client_shutdown_timeout: 120
|
@ -0,0 +1,6 @@
|
||||
# Lengthen the timeout for thrashed MDS
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
client:
|
||||
client_shutdown_timeout: 120
|
@ -0,0 +1,13 @@
|
||||
tasks:
|
||||
- check-counter:
|
||||
counters:
|
||||
mgr:
|
||||
- name: "finisher-volumes.complete_latency.avgcount"
|
||||
min: 4
|
||||
- name: "finisher-volumes.queue_len"
|
||||
expected_val: 0
|
||||
|
||||
- cephfs_test_runner:
|
||||
fail_on_skip: false
|
||||
modules:
|
||||
- tasks.cephfs.test_volumes.TestPerModuleFinsherThread
|
0
ceph/qa/suites/krbd/singleton-msgr-failures/%
Normal file
0
ceph/qa/suites/krbd/singleton-msgr-failures/%
Normal file
@ -0,0 +1 @@
|
||||
.qa/objectstore/bluestore-bitmap.yaml
|
7
ceph/qa/suites/krbd/singleton-msgr-failures/conf.yaml
Normal file
7
ceph/qa/suites/krbd/singleton-msgr-failures/conf.yaml
Normal file
@ -0,0 +1,7 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
global:
|
||||
ms die on skipped message: false
|
||||
client:
|
||||
rbd default features: 37
|
1
ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/.qa
Symbolic link
1
ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/.qa
Symbolic link
@ -0,0 +1 @@
|
||||
../.qa/
|
@ -0,0 +1,5 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
client:
|
||||
rbd default map options: ms_mode=crc,rxbounce
|
@ -0,0 +1,5 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
client:
|
||||
rbd default map options: ms_mode=crc
|
@ -0,0 +1,5 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
client:
|
||||
rbd default map options: ms_mode=legacy,rxbounce
|
@ -0,0 +1,5 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
client:
|
||||
rbd default map options: ms_mode=legacy
|
@ -0,0 +1,5 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
client:
|
||||
rbd default map options: ms_mode=secure
|
1
ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/.qa
Symbolic link
1
ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/.qa
Symbolic link
@ -0,0 +1 @@
|
||||
../.qa/
|
1
ceph/qa/suites/krbd/singleton-msgr-failures/tasks/.qa
Symbolic link
1
ceph/qa/suites/krbd/singleton-msgr-failures/tasks/.qa
Symbolic link
@ -0,0 +1 @@
|
||||
../.qa/
|
@ -2,6 +2,7 @@ overrides:
|
||||
ceph:
|
||||
conf:
|
||||
global:
|
||||
mon warn on pool no app: false
|
||||
ms die on skipped message: false
|
||||
client:
|
||||
rbd default features: 37
|
||||
|
19
ceph/qa/suites/krbd/singleton/tasks/krbd_watch_errors.yaml
Normal file
19
ceph/qa/suites/krbd/singleton/tasks/krbd_watch_errors.yaml
Normal file
@ -0,0 +1,19 @@
|
||||
overrides:
|
||||
ceph:
|
||||
conf:
|
||||
global:
|
||||
osd pool default size: 1
|
||||
osd:
|
||||
osd shutdown pgref assert: true
|
||||
roles:
|
||||
- [mon.a, mgr.x, osd.0, client.0]
|
||||
|
||||
tasks:
|
||||
- install:
|
||||
extra_system_packages:
|
||||
- fio
|
||||
- ceph:
|
||||
- workunit:
|
||||
clients:
|
||||
all:
|
||||
- rbd/krbd_watch_errors.sh
|
@ -1,3 +1,28 @@
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- \(HOST_IN_MAINTENANCE\)
|
||||
- \(OSD_DOWN\)
|
||||
- \(MON_DOWN\)
|
||||
- down
|
||||
- overall HEALTH_
|
||||
- \(CEPHADM_STRAY_DAEMON\)
|
||||
- stray daemon
|
||||
- \(FS_DEGRADED\)
|
||||
- \(MDS_FAILED\)
|
||||
- \(MDS_DEGRADED\)
|
||||
- \(FS_WITH_FAILED_MDS\)
|
||||
- \(MDS_DAMAGE\)
|
||||
- \(MDS_ALL_DOWN\)
|
||||
- \(MDS_UP_LESS_THAN_MAX\)
|
||||
- \(FS_INLINE_DATA_DEPRECATED\)
|
||||
- \(PG_DEGRADED\)
|
||||
- Degraded data redundancy
|
||||
- \(PG_
|
||||
- acting
|
||||
- MDS_INSUFFICIENT_STANDBY
|
||||
- deprecated feature inline_data
|
||||
- compat changed unexpectedly
|
||||
roles:
|
||||
# 3 osd roles on host.a is required for cephadm task. It checks if the cluster is healthy.
|
||||
# More daemons will be deployed on both hosts in e2e tests.
|
||||
|
@ -24,6 +24,21 @@ openstack:
|
||||
size: 10 # GB
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- slow requests
|
||||
- \(PG_
|
||||
- PG_
|
||||
- \(CEPHADM_STRAY_DAEMON\)
|
||||
- slow request
|
||||
- \(MDS_
|
||||
- MDS_
|
||||
- osds down
|
||||
- OSD_
|
||||
- \(OSD_
|
||||
- client
|
||||
- FS_
|
||||
- \(FS_
|
||||
- degraded
|
||||
conf:
|
||||
osd:
|
||||
osd shutdown pgref assert: true
|
||||
|
@ -1,3 +1,10 @@
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- \(MON_DOWN\)
|
||||
- \(OSD_DOWN\)
|
||||
- \(PG_
|
||||
- but it is still running
|
||||
tasks:
|
||||
- cephadm.shell:
|
||||
host.a:
|
||||
|
@ -1,3 +1,10 @@
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- \(MON_DOWN\)
|
||||
- \(OSD_DOWN\)
|
||||
- \(PG_
|
||||
- but it is still running
|
||||
tasks:
|
||||
- cephadm.shell:
|
||||
host.a:
|
||||
|
@ -1,3 +1,10 @@
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- \(MON_DOWN\)
|
||||
- \(OSD_DOWN\)
|
||||
- \(PG_
|
||||
- but it is still running
|
||||
tasks:
|
||||
- cephadm.shell:
|
||||
host.a:
|
||||
|
@ -1,3 +1,10 @@
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- \(MON_DOWN\)
|
||||
- \(OSD_DOWN\)
|
||||
- \(PG_
|
||||
- but it is still running
|
||||
tasks:
|
||||
- cephadm.shell:
|
||||
host.a:
|
||||
|
@ -1,3 +1,11 @@
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- \(MON_DOWN\)
|
||||
- \(OSD_DOWN\)
|
||||
- \(PG_
|
||||
- but it is still running
|
||||
- \(CEPHADM_STRAY_DAEMON\)
|
||||
tasks:
|
||||
- cephadm.shell:
|
||||
host.a:
|
||||
|
@ -1,3 +1,11 @@
|
||||
overrides:
|
||||
ceph:
|
||||
log-ignorelist:
|
||||
- \(MON_DOWN\)
|
||||
- \(PG_AVAILABILITY\)
|
||||
- mon down
|
||||
- mons down
|
||||
- out of quorum
|
||||
tasks:
|
||||
- cephadm:
|
||||
conf:
|
||||
|
@ -3,6 +3,23 @@ overrides:
|
||||
log-ignorelist:
|
||||
- but it is still running
|
||||
- objects unfound and apparently lost
|
||||
- \(MON_DOWN\)
|
||||
- \(OSDMAP_FLAGS\)
|
||||
- flag\(s\) set
|
||||
- \(CACHE_POOL_NO_HIT_SET\)
|
||||
- \(CACHE_
|
||||
- \(PG_
|
||||
- \(OSD_
|
||||
- \(POOL_
|
||||
- \(CEPHADM_STRAY_DAEMON\)
|
||||
- PG_
|
||||
- CACHE_
|
||||
- degraded
|
||||
- backfill
|
||||
- mons down
|
||||
- OSD_
|
||||
- is down
|
||||
- acting
|
||||
conf:
|
||||
osd:
|
||||
osd debug reject backfill probability: .3
|
||||
|
@ -1,3 +1,14 @@
overrides:
  ceph:
    log-ignorelist:
      - \(MON_DOWN\)
      - \(PG_
      - mons down
      - pg inactive
      - out of quorum
      - \(OSD_
      - osds down
      - osd down
tasks:
- cephadm.shell:
    env: [sha1]

@ -1,3 +1,9 @@
overrides:
  ceph:
    log-ignorelist:
      - Replacing daemon mds
      - FS_DEGRADED
      - \(CEPHADM_STRAY_DAEMON\)
roles:
- - host.a
  - osd.0

@ -1,3 +1,10 @@
overrides:
  ceph:
    log-ignorelist:
      - \(MON_DOWN\)
      - \(OSD_DOWN\)
      - \(CEPHADM_PAUSED\)
      - mons down
roles:
- - host.a
  - osd.0

@ -1,3 +1,10 @@
overrides:
  ceph:
    log-ignorelist:
      - \(MON_DOWN\)
      - mons down
      - \(MGR_DOWN\)
      - out of quorum
roles:
- - host.a
  - osd.0

@ -11,6 +11,15 @@ overrides:
      - \(POOL_APP_NOT_ENABLED\)
      - \(PG_AVAILABILITY\)
      - \(PG_DEGRADED\)
      - \(MON_DOWN\)
      - \(CEPHADM_STRAY_DAEMON\)
      - missing hit_sets
      - do not have an application enabled
      - application not enabled on pool
      - pool application
      - mons down
      - out of quorum
      - needs hit_set_type to be set but it is not
    conf:
      client:
        debug ms: 1

@ -2,6 +2,7 @@ overrides:
  ceph:
    log-ignorelist:
      - \(PG_AVAILABILITY\)
      - \(POOL_APP_NOT_ENABLED\)
    conf:
      osd:
        osd_class_load_list: "*"

@ -8,6 +8,13 @@ overrides:
      - \(OSD_
      - \(OBJECT_
      - \(POOL_APP_NOT_ENABLED\)
      - \(MON_DOWN\)
      - mons down
      - application not enabled on pool
      - do not have an application enabled
      - pool application
      - out of quorum
      - needs hit_set_type to be set but it is not
tasks:
- workunit:
    clients:
@ -0,0 +1,43 @@
tasks:
- install:
- ceph:
    wait-for-scrub: false
- check-counter:
    counters:
      mgr:
        - name: "finisher-balancer.complete_latency.avgcount"
          min: 1
        - name: "finisher-balancer.queue_len"
          expected_val: 0
        - name: "finisher-crash.complete_latency.avgcount"
          min: 2
        - name: "finisher-crash.queue_len"
          expected_val: 0
        - name: "finisher-devicehealth.complete_latency.avgcount"
          min: 1
        - name: "finisher-devicehealth.queue_len"
          expected_val: 0
        - name: "finisher-iostat.complete_latency.avgcount"
          min: 1
        - name: "finisher-iostat.queue_len"
          expected_val: 0
        - name: "finisher-pg_autoscaler.complete_latency.avgcount"
          min: 1
        - name: "finisher-pg_autoscaler.queue_len"
          expected_val: 0
        - name: "finisher-progress.complete_latency.avgcount"
          min: 2
        - name: "finisher-progress.queue_len"
          expected_val: 0
        - name: "finisher-status.complete_latency.avgcount"
          min: 2
        - name: "finisher-status.queue_len"
          expected_val: 0
        - name: "finisher-telemetry.complete_latency.avgcount"
          min: 1
        - name: "finisher-telemetry.queue_len"
          expected_val: 0
- workunit:
    clients:
      client.0:
        - mgr/test_per_module_finisher.sh
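Note: the new suite above asserts minimum values for the mgr per-module finisher perf counters. As a rough illustration (not part of the change), the same rules could be checked by hand against a perf-dump JSON obtained from the active mgr; the input file name and the rule subset below are assumptions.

import json

def check_finisher_counters(perf_dump):
    # expected thresholds, mirroring the check-counter rules in the YAML above
    rules = {
        'finisher-balancer.complete_latency.avgcount': ('min', 1),
        'finisher-balancer.queue_len': ('expected_val', 0),
        'finisher-telemetry.complete_latency.avgcount': ('min', 1),
        # ... the remaining finisher-* counters follow the same two patterns
    }
    for name, (kind, value) in rules.items():
        section, counter_path = name.split('.', 1)
        got = perf_dump[section]
        for part in counter_path.split('.'):
            got = got[part]
        if kind == 'min':
            assert got >= value, f'{name}: {got} < {value}'
        else:
            assert got == value, f'{name}: {got} != {value}'

# 'perf_dump.json' is a hypothetical file holding a parsed mgr perf dump
with open('perf_dump.json') as f:
    check_finisher_counters(json.load(f))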
@ -13,4 +13,4 @@ tasks:
- workunit:
    clients:
      client.0:
        - mgr
        - mgr/test_localpool.sh
ceph/qa/suites/rados/standalone/workloads/mon-stretch.yaml (new file, 18 lines)
@ -0,0 +1,18 @@
roles:
- - mon.a
  - mgr.x
  - osd.0
  - osd.1
  - osd.2
  - client.0
openstack:
- volumes: # attached to each instance
    count: 3
    size: 10 # GB
tasks:
- install:
- workunit:
    basedir: qa/standalone
    clients:
      all:
        - mon-stretch
@ -4,6 +4,8 @@ overrides:
      osd:
        osd_class_load_list: "*"
        osd_class_default_list: "*"
    log-ignorelist:
      - \(POOL_APP_NOT_ENABLED\)
tasks:
- workunit:
    clients:

@ -0,0 +1,13 @@
overrides:
  ceph:
    conf:
      mgr:
        debug rbd: 20
tasks:
- install:
    extra_system_packages:
      - fio
- workunit:
    clients:
      client.0:
        - rbd/rbd_support_module_recovery.sh

ceph/qa/suites/rgw/verify/tasks/bucket-check.yaml (new file, 5 lines)
@ -0,0 +1,5 @@
tasks:
- workunit:
    clients:
      client.0:
        - rgw/run-bucket-check.sh
@ -6,7 +6,7 @@ workload:
  - sequential:
    - ragweed:
        client.1:
          default-branch: ceph-pacific
          default-branch: ceph-nautilus
          rgw_server: client.1
          stages: prepare
    - print: "**** done rgw ragweed prepare 2-workload"

@ -5,7 +5,7 @@ rgw-final-workload:
  full_sequential:
  - ragweed:
      client.1:
        default-branch: ceph-pacific
        default-branch: ceph-nautilus
        rgw_server: client.1
        stages: check
  - print: "**** done ragweed check 4-final-workload"

@ -5,7 +5,7 @@ rgw-final-workload:
  full_sequential:
  - ragweed:
      client.1:
        default-branch: ceph-pacific
        default-branch: ceph-octopus
        rgw_server: client.1
        stages: check
  - print: "**** done ragweed check 4-final-workload"

@ -123,7 +123,7 @@ workload_pacific:
        - rados/test.sh
        - cls
      env:
        CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces'
        CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
  - print: "**** done rados/test.sh & cls workload_pacific"
  - sequential:
    - rgw: [client.0]

@ -7,4 +7,6 @@ stress-tasks:
    clients:
      client.0:
        - cls/test_cls_rbd.sh
    env:
      CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot'
- print: "**** done cls/test_cls_rbd.sh 4-workload"

@ -3,7 +3,7 @@ meta:
    librbd python api tests
tasks:
- workunit:
    tag: v16.2.7
    branch: pacific
    clients:
      client.0:
        - rbd/test_librbd_python.sh
@ -232,6 +232,7 @@ class OSDThrasher(Thrasher):
        self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0)
        self.random_eio = self.config.get('random_eio')
        self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3)
        self.chance_reset_purged_snaps_last = self.config.get('chance_reset_purged_snaps_last', 0.3)

        num_osds = self.in_osds + self.out_osds
        self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * len(num_osds)

@ -798,6 +799,19 @@ class OSDThrasher(Thrasher):
        else:
            self.cancel_force_recovery()

    def reset_purged_snaps_last(self):
        """
        Run reset_purged_snaps_last
        """
        self.log('reset_purged_snaps_last')
        for osd in self.in_osds:
            try:
                self.ceph_manager.raw_cluster_cmd(
                    'tell', "osd.%s" % (str(osd)),
                    'reset_purged_snaps_last')
            except CommandFailedError:
                self.log('Failed to reset_purged_snaps_last, ignoring')

    def all_up(self):
        """
        Make sure all osds are up and not out.

@ -1248,6 +1262,8 @@ class OSDThrasher(Thrasher):
        actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,))
        if self.chance_force_recovery > 0:
            actions.append((self.force_cancel_recovery, self.chance_force_recovery))
        if self.chance_reset_purged_snaps_last > 0:
            actions.append((self.reset_purged_snaps_last, self.chance_reset_purged_snaps_last))

        for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
            for scenario in [
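Note: the new thrasher action above simply issues `ceph tell osd.N reset_purged_snaps_last` against every in-cluster OSD. A minimal standalone sketch of the same loop (the `osd_ids` list and the use of the `ceph` CLI via subprocess are illustration choices, not part of the change):

import subprocess

def reset_purged_snaps_last(osd_ids):
    # same admin command the thrasher sends through raw_cluster_cmd()
    for osd in osd_ids:
        try:
            subprocess.run(['ceph', 'tell', f'osd.{osd}', 'reset_purged_snaps_last'],
                           check=True)
        except subprocess.CalledProcessError:
            print(f'osd.{osd}: reset_purged_snaps_last failed, ignoring')

reset_purged_snaps_last([0, 1, 2])  # hypothetical OSD ids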
@ -2,6 +2,8 @@
# make logging friendly to teuthology
log_to_file = true
log_to_stderr = false
log to journald = false
mon cluster log to file = true
mon cluster log file level = debug

mon clock drift allowed = 1.000
@ -811,7 +811,7 @@ class CephFSMount(object):
        ))
        p.wait()

    def open_background(self, basename="background_file", write=True):
    def open_background(self, basename="background_file", write=True, content="content"):
        """
        Open a file for writing, then block such that the client
        will hold a capability.

@ -828,12 +828,11 @@ class CephFSMount(object):
                import time

                with open("{path}", 'w') as f:
                    f.write('content')
                    f.write("{content}")
                    f.flush()
                    f.write('content2')
                    while True:
                        time.sleep(1)
                """).format(path=path)
                """).format(path=path, content=content)
        else:
            pyscript = dedent("""
                import time

@ -849,7 +848,10 @@ class CephFSMount(object):
        # This wait would not be sufficient if the file had already
        # existed, but it's simple and in practice users of open_background
        # are not using it on existing files.
        self.wait_for_visible(basename)
        if write:
            self.wait_for_visible(basename, size=len(content))
        else:
            self.wait_for_visible(basename)

        return rproc

@ -887,19 +889,27 @@ class CephFSMount(object):
        if nr_links == 2:
            return

    def wait_for_visible(self, basename="background_file", timeout=30):
    def wait_for_visible(self, basename="background_file", size=None, timeout=30):
        i = 0
        args = ['stat']
        if size is not None:
            args += ['--printf=%s']
        args += [os.path.join(self.hostfs_mntpt, basename)]
        while i < timeout:
            r = self.client_remote.run(args=[
                'stat', os.path.join(self.hostfs_mntpt, basename)
            ], check_status=False)
            if r.exitstatus == 0:
                log.debug("File {0} became visible from {1} after {2}s".format(
                    basename, self.client_id, i))
                return
            else:
                time.sleep(1)
                i += 1
            p = self.client_remote.run(args=args, stdout=StringIO(), check_status=False)
            if p.exitstatus == 0:
                if size is not None:
                    s = p.stdout.getvalue().strip()
                    if int(s) == size:
                        log.info(f"File {basename} became visible with size {size} from {self.client_id} after {i}s")
                        return
                    else:
                        log.error(f"File {basename} became visible but with size {int(s)} not {size}")
                else:
                    log.info(f"File {basename} became visible from {self.client_id} after {i}s")
                    return
            time.sleep(1)
            i += 1

        raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
            i, basename, self.client_id))
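Note: the reworked wait_for_visible() above polls `stat` on the mount and, when a size is given, also compares the reported size (`stat --printf=%s`) so that the writer's data, not just the dentry, is known to be visible. A self-contained sketch of the same polling idea for a local path (the local-path setting and names are assumptions for illustration):

import subprocess
import time

def wait_for_visible(path, size=None, timeout=30):
    """Poll until `path` exists and, optionally, has the expected size in bytes."""
    for i in range(timeout):
        # `stat --printf=%s` prints only the file size
        p = subprocess.run(['stat', '--printf=%s', path],
                           capture_output=True, text=True)
        if p.returncode == 0 and (size is None or int(p.stdout.strip()) == size):
            return i
        time.sleep(1)
    raise RuntimeError(f"{path} did not become visible within {timeout}s")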
@ -1,6 +1,8 @@
"""
Before running this testsuite, add path to cephfs-shell module to $PATH and
export $PATH.
NOTE: For running this tests locally (using vstart_runner.py), export the
path to src/tools/cephfs/shell/cephfs-shell module to $PATH. Running
"export PATH=$PATH:$(cd ../src/tools/cephfs/shell && pwd)" from the build dir
will update the environment without hassles of typing the path correctly.
"""
from io import StringIO
from os import path

@ -9,7 +9,9 @@ from textwrap import dedent
from tasks.ceph_test_case import TestTimeoutError
from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
from tasks.cephfs.fuse_mount import FuseMount
from teuthology.exceptions import CommandFailedError
import os
from io import StringIO


log = logging.getLogger(__name__)
@ -157,29 +159,49 @@ class TestClientLimits(CephFSTestCase):
        a fraction of second (0.5) by default when throttling condition is met.
        """

        max_caps_per_client = 500
        cap_acquisition_throttle = 250
        subdir_count = 4
        files_per_dir = 25

        self.config_set('mds', 'mds_max_caps_per_client', max_caps_per_client)
        self.config_set('mds', 'mds_session_cap_acquisition_throttle', cap_acquisition_throttle)
        # throttle in a way so that two dir reads are already hitting it.
        throttle_value = (files_per_dir * 3) // 2

        # Create 1500 files split across 6 directories, 250 each.
        for i in range(1, 7):
            self.mount_a.create_n_files("dir{0}/file".format(i), cap_acquisition_throttle, sync=True)
        # activate throttling logic by setting max per client to a low value
        self.config_set('mds', 'mds_max_caps_per_client', 1)
        self.config_set('mds', 'mds_session_cap_acquisition_throttle', throttle_value)

        # Create files split across {subdir_count} directories, {per_dir_count} in each dir
        for i in range(1, subdir_count+1):
            self.mount_a.create_n_files("dir{0}/file".format(i), files_per_dir, sync=True)

        mount_a_client_id = self.mount_a.get_global_id()

        # recursive readdir
        self.mount_a.run_shell_payload("find | wc")

        # validate cap_acquisition decay counter after readdir to exceed throttle count i.e 250
        cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
        self.assertGreaterEqual(cap_acquisition_value, cap_acquisition_throttle)
        # recursive readdir. macOs wants an explicit directory for `find`.
        proc = self.mount_a.run_shell_payload("find . | wc", stderr=StringIO())
        # return code may be None if the command got interrupted
        self.assertTrue(proc.returncode is None or proc.returncode == 0, proc.stderr.getvalue())

        # validate the throttle condition to be hit atleast once
        cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle']
        self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1)

        # validate cap_acquisition decay counter after readdir to NOT exceed the throttle value
        # plus one batch that could have been taken immediately before querying
        # assuming the batch is equal to the per dir file count.
        cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value']
        self.assertLessEqual(cap_acquisition_value, files_per_dir + throttle_value)

        # make sure that the throttle was reported in the events
        def historic_ops_have_event(expected_event):
            ops_dump = self.fs.rank_tell(['dump_historic_ops'])
            # reverse the events and the ops assuming that later ops would be throttled
            for op in reversed(ops_dump['ops']):
                for ev in reversed(op.get('type_data', {}).get('events', [])):
                    if ev['event'] == expected_event:
                        return True
            return False

        self.assertTrue(historic_ops_have_event('cap_acquisition_throttle'))

    def test_client_release_bug(self):
        """
        When a client has a bug (which we will simulate) preventing it from releasing caps,
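Note: with the numbers used in the reworked cap-acquisition throttle test above, the bounds work out as follows (a quick arithmetic check, nothing beyond the test's own values):

files_per_dir = 25
subdir_count = 4
throttle_value = (files_per_dir * 3) // 2      # 37: hit within the second directory scan
total_files = files_per_dir * subdir_count     # 100 files created in all
upper_bound = files_per_dir + throttle_value   # 62: cap_acquisition must stay at or below this
print(throttle_value, total_files, upper_bound)  # 37 100 62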
@ -219,6 +241,55 @@ class TestClientLimits(CephFSTestCase):
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
        rproc.wait()

    def test_client_blocklisted_oldest_tid(self):
        """
        that a client is blocklisted when its encoded session metadata exceeds the
        configured threshold (due to ever growing `completed_requests` caused due
        to an unidentified bug (in the client or the MDS)).
        """

        # num of requests client issues
        max_requests = 10000

        # The debug hook to inject the failure only exists in the fuse client
        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client to inject client release failure")

        self.config_set('client', 'client inject fixed oldest tid', 'true')
        self.mount_a.teardown()
        self.mount_a.mount_wait()

        self.config_set('mds', 'mds_max_completed_requests', max_requests);

        # Create lots of files
        self.mount_a.create_n_files("testdir/file1", max_requests + 100)

        # Create a few files synchronously. This makes sure previous requests are completed
        self.mount_a.create_n_files("testdir/file2", 5, True)

        # Wait for the health warnings. Assume mds can handle 10 request per second at least
        self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id))

        # set the threshold low so that it has a high probability of
        # hitting.
        self.config_set('mds', 'mds_session_metadata_threshold', 5000);

        # Create lot many files synchronously. This would hit the session metadata threshold
        # causing the client to get blocklisted.
        with self.assertRaises(CommandFailedError):
            self.mount_a.create_n_files("testdir/file2", 100000, True)

        self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr())
        # the mds should bump up the relevant perf counter
        pd = self.perf_dump()
        self.assertGreater(pd['mds_sessions']['mdthresh_evicted'], 0)

        # reset the config
        self.config_set('client', 'client inject fixed oldest tid', 'false')

        self.mount_a.kill_cleanup()
        self.mount_a.mount_wait()

    def test_client_oldest_tid(self):
        """
        When a client does not advance its oldest tid, the MDS should notice that
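Note: outside teuthology, the behaviour exercised by the new test above can be inspected with the cluster CLI. A rough sketch (the daemon name `mds.a` and running `ceph daemon ... perf dump` on the MDS host via its admin socket are assumptions for illustration; the option and counter names come from the change itself):

import json
import subprocess

# lower the session-metadata limit, same option the test sets
subprocess.run(['ceph', 'config', 'set', 'mds', 'mds_session_metadata_threshold', '5000'],
               check=True)

# after a misbehaving client has been evicted, the counter should be non-zero
out = subprocess.run(['ceph', 'daemon', 'mds.a', 'perf', 'dump'],
                     capture_output=True, text=True, check=True).stdout
perf = json.loads(out)
print(perf['mds_sessions'].get('mdthresh_evicted', 0))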
@ -10,8 +10,10 @@ from textwrap import dedent
import time
import distutils.version as version
import re
import string
import os

from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.fuse_mount import FuseMount

@ -221,8 +223,10 @@ class TestClientRecovery(CephFSTestCase):
        # Capability release from stale session
        # =====================================
        if write:
            cap_holder = self.mount_a.open_background()
            content = ''.join(random.choices(string.ascii_uppercase + string.digits, k=16))
            cap_holder = self.mount_a.open_background(content=content)
        else:
            content = ''
            self.mount_a.run_shell(["touch", "background_file"])
            self.mount_a.umount_wait()
            self.mount_a.mount_wait()

@ -233,7 +237,7 @@ class TestClientRecovery(CephFSTestCase):

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()
        self.mount_b.wait_for_visible(size=len(content))

        # Simulate client death
        self.mount_a.suspend_netns()

@ -264,11 +268,9 @@ class TestClientRecovery(CephFSTestCase):
                "Capability handover took {0}, expected approx {1}".format(
                    cap_waited, session_timeout
                ))

            self.mount_a._kill_background(cap_holder)
        finally:
            # teardown() doesn't quite handle this case cleanly, so help it out
            self.mount_a.resume_netns()
            self.mount_a.resume_netns() # allow the mount to recover otherwise background proc is unkillable
            self.mount_a._kill_background(cap_holder)

    def test_stale_read_caps(self):
        self._test_stale_caps(False)

@ -319,9 +321,9 @@ class TestClientRecovery(CephFSTestCase):
                    cap_waited, session_timeout / 2.0
                ))

            self.mount_a._kill_background(cap_holder)
        finally:
            self.mount_a.resume_netns()
            self.mount_a.resume_netns() # allow the mount to recover otherwise background proc is unkillable
            self.mount_a._kill_background(cap_holder)

    def test_trim_caps(self):
        # Trim capability when reconnecting MDS

@ -387,7 +389,6 @@ class TestClientRecovery(CephFSTestCase):

        self.mount_b.check_filelock(do_flock=flockable)

        # Tear down the background process
        self.mount_a._kill_background(lock_holder)

    def test_filelock_eviction(self):

@ -416,7 +417,6 @@ class TestClientRecovery(CephFSTestCase):
            # succeed
            self.wait_until_true(lambda: lock_taker.finished, timeout=10)
        finally:
            # Tear down the background process
            self.mount_a._kill_background(lock_holder)

            # teardown() doesn't quite handle this case cleanly, so help it out
@ -751,24 +751,27 @@ class TestClientOnLaggyOSD(CephFSTestCase):
            # it takes time to have laggy clients entries in cluster log,
            # wait for 6 minutes to see if it is visible, finally restart
            # the client
            tries = 6
            while True:
                try:
                    with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs",
                                                 timeout=55):
                        # make sure clients weren't evicted
                        self.assert_session_count(2)
                        break
                except AssertionError:
                    tries -= 1
                    if tries:
                        continue
                    raise
            with contextutil.safe_while(sleep=5, tries=6) as proceed:
                while proceed():
                    try:
                        with self.assert_cluster_log("1 client(s) laggy due to"
                                                     " laggy OSDs",
                                                     timeout=55):
                            # make sure clients weren't evicted
                            self.assert_session_count(2)
                        break
                    except (AssertionError, CommandFailedError) as e:
                        log.debug(f'{e}, retrying')

            # clear lagginess, expect to get the warning cleared and make sure
            # client gets evicted
            self.clear_laggy_params(osd)
            self.wait_for_health_clear(60)
            self.assert_session_count(1)
        finally:
            self.mount_a.kill_cleanup()
            self.mount_a.mount_wait()
            self.mount_a.create_destroy()
            self.clear_laggy_params(osd)

    def test_client_eviction_if_config_is_unset(self):
        """

@ -800,6 +803,11 @@ class TestClientOnLaggyOSD(CephFSTestCase):

            time.sleep(session_timeout)
            self.assert_session_count(1)

            # make sure warning wasn't seen in cluster log
            with self.assert_cluster_log("laggy due to laggy OSDs",
                                         timeout=120, present=False):
                pass
        finally:
            self.mount_a.kill_cleanup()
            self.mount_a.mount_wait()
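Note: the retry loop above now uses teuthology's contextutil.safe_while helper instead of a hand-rolled counter. A minimal illustration of the idiom (requires a teuthology checkout; `flaky_check` is a stand-in, not part of the change):

from teuthology import contextutil

def flaky_check():
    ...  # stand-in for an assertion that may not pass on the first attempt

# retry up to 6 times, sleeping 5s between attempts; safe_while gives up
# (raises) once the configured number of tries is exhausted
with contextutil.safe_while(sleep=5, tries=6) as proceed:
    while proceed():
        try:
            flaky_check()
            break
        except AssertionError as e:
            print(f'{e}, retrying')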
@ -608,8 +608,9 @@ class TestDamage(CephFSTestCase):
        self.fs.flush()
        self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0")
        time.sleep(5) # for conf to percolate
        p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
        with self.assert_cluster_log("MDS abort because newly corrupt dentry"):
            p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False)
            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
        self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first")
        self.fs.rank_freeze(False, rank=0)
        self.delete_mds_coredump(rank0['name'])

@ -642,9 +643,10 @@ class TestDamage(CephFSTestCase):
        rank0 = self.fs.get_rank()
        self.fs.rank_freeze(True, rank=0)
        # so now we want to trigger commit but this will crash, so:
        c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
        p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
        with self.assert_cluster_log("MDS abort because newly corrupt dentry"):
            c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"]
            p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30)
            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout)
        self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first")
        self.fs.rank_freeze(False, rank=0)
        self.delete_mds_coredump(rank0['name'])
@ -14,9 +14,12 @@ class TestClusterAffinity(CephFSTestCase):
    CLIENTS_REQUIRED = 0
    MDSS_REQUIRED = 4

    def _verify_join_fs(self, target, status=None):
    def _verify_join_fs(self, target, status=None, fs=None):
        fs_select = fs
        if fs_select is None:
            fs_select = self.fs
        if status is None:
            status = self.fs.wait_for_daemons(timeout=30)
            status = fs_select.wait_for_daemons(timeout=30)
            log.debug("%s", status)
        target = sorted(target, key=operator.itemgetter('name'))
        log.info("target = %s", target)

@ -37,11 +40,14 @@ class TestClusterAffinity(CephFSTestCase):
                return
        self.fail("no entity")

    def _verify_init(self):
        status = self.fs.status()
    def _verify_init(self, fs=None):
        fs_select = fs
        if fs_select is None:
            fs_select = self.fs
        status = fs_select.status()
        log.info("status = {0}".format(status))
        target = [{'join_fscid': -1, 'name': info['name']} for info in status.get_all()]
        self._verify_join_fs(target, status=status)
        self._verify_join_fs(target, status=status, fs=fs_select)
        return (status, target)

    def _reach_target(self, target):

@ -109,12 +115,21 @@ class TestClusterAffinity(CephFSTestCase):
        fs2 = self.mds_cluster.newfs(name="cephfs2")
        status, target = self._verify_init()
        active = self.fs.get_active_names(status=status)[0]
        status2, _ = self._verify_init(fs=fs2)
        active2 = fs2.get_active_names(status=status2)[0]
        standbys = [info['name'] for info in status.get_standbys()]
        victim = standbys.pop()
        # Set a bogus fs on the others
        for mds in standbys:
            self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2')
            self._change_target_state(target, mds, {'join_fscid': fs2.id})
        # The active MDS for cephfs2 will be replaced by the MDS for which
        # file system affinity has been set. Also, set the affinity for
        # the earlier active MDS so that it is not chosen by the monitors
        # as an active MDS for the existing file system.
        log.info(f'assigning affinity to cephfs2 for active mds (mds.{active2})')
        self.config_set(f'mds.{active2}', 'mds_join_fs', 'cephfs2')
        self._change_target_state(target, active2, {'join_fscid': fs2.id})
        self.fs.rank_fail()
        self._change_target_state(target, victim, {'state': 'up:active'})
        self._reach_target(target)