diff --git a/ceph/CMakeLists.txt b/ceph/CMakeLists.txt
index 561ce9284..648f6cec1 100644
--- a/ceph/CMakeLists.txt
+++ b/ceph/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.10.2)
 # remove cmake/modules/FindPython* once 3.12 is required
 project(ceph
-  VERSION 16.2.14
+  VERSION 16.2.15
   LANGUAGES CXX C ASM)
 foreach(policy
diff --git a/ceph/PendingReleaseNotes b/ceph/PendingReleaseNotes
index 8322ba3ad..c02bdf04a 100644
--- a/ceph/PendingReleaseNotes
+++ b/ceph/PendingReleaseNotes
@@ -32,6 +32,29 @@
   in certain recovery scenarios, e.g., monitor database lost and rebuilt, and
   the restored file system is expected to have the same ID as before.
 
+>=16.2.15
+----------
+* `ceph config dump --format ` output will display the localized
+  option names instead of their normalized versions. For example,
+  "mgr/prometheus/x/server_port" will be displayed instead of
+  "mgr/prometheus/server_port". This matches the output of the
+  non-pretty-printed version of the command.
+
+* CEPHFS: The MDS evicts clients that are not advancing their request tids, which
+  causes a large buildup of session metadata and results in the MDS going read-only
+  because the RADOS operation exceeds the size threshold. The `mds_session_metadata_threshold`
+  config controls the maximum size that the (encoded) session metadata can grow to.
+
+* RADOS: The `get_pool_is_selfmanaged_snaps_mode` C++ API has been deprecated
+  because it is prone to false negative results. Its safer replacement is
+  `pool_is_in_selfmanaged_snaps_mode`.
+
+* RBD: When diffing against the beginning of time (`fromsnapname == NULL`) in
+  fast-diff mode (`whole_object == true` with `fast-diff` image feature enabled
+  and valid), diff-iterate is now guaranteed to execute locally if exclusive
+  lock is available. This brings a dramatic performance improvement for QEMU
+  live disk synchronization and backup use cases.
+
 >= 16.2.14
 ----------
 
@@ -132,6 +155,10 @@
 * CEPHFS: After recovering a Ceph File System post following the disaster recovery
   procedure, the recovered files under `lost+found` directory can now be deleted.
 * core: cache-tiering is now deprecated.
+* mgr/snap_schedule: The snap-schedule mgr module now retains one snapshot
+  fewer than the number specified by the config tunable `mds_max_snaps_per_dir`,
+  so that a new snapshot can be created and retained during the next
+  schedule run.
 
 >=16.2.8
 --------
diff --git a/ceph/admin/doc-requirements.txt b/ceph/admin/doc-requirements.txt
index dec096a86..77382b8f4 100644
--- a/ceph/admin/doc-requirements.txt
+++ b/ceph/admin/doc-requirements.txt
@@ -1,4 +1,4 @@
-Sphinx == 4.4.0
+Sphinx == 5.0.2
 git+https://github.com/ceph/sphinx-ditaa.git@py3#egg=sphinx-ditaa
 breathe >= 4.20.0
 Jinja2
diff --git a/ceph/ceph.spec b/ceph/ceph.spec
index 912c5f409..567a2809f 100644
--- a/ceph/ceph.spec
+++ b/ceph/ceph.spec
@@ -135,7 +135,7 @@
 # main package definition
 #################################################################################
 Name: ceph
-Version: 16.2.14
+Version: 16.2.15
 Release: 0%{?dist}
 %if 0%{?fedora} || 0%{?rhel}
 Epoch: 2
@@ -151,7 +151,7 @@ License: LGPL-2.1 and LGPL-3.0 and CC-BY-SA-3.0 and GPL-2.0 and BSL-1.0 and BSD-
 Group: System/Filesystems
 %endif
 URL: http://ceph.com/
-Source0: %{?_remote_tarball_prefix}ceph-16.2.14.tar.bz2
+Source0: %{?_remote_tarball_prefix}ceph-16.2.15.tar.bz2
 %if 0%{?suse_version}
 # _insert_obs_source_lines_here
 ExclusiveArch: x86_64 aarch64 ppc64le s390x
@@ -1208,7 +1208,7 @@ This package provides Ceph default alerts for Prometheus.
# common ################################################################################# %prep -%autosetup -p1 -n ceph-16.2.14 +%autosetup -p1 -n ceph-16.2.15 %build # Disable lto on systems that do not support symver attribute diff --git a/ceph/changelog.upstream b/ceph/changelog.upstream index 5c9040201..fc8d42060 100644 --- a/ceph/changelog.upstream +++ b/ceph/changelog.upstream @@ -1,7 +1,13 @@ -ceph (16.2.14-1focal) focal; urgency=medium +ceph (16.2.15-1focal) focal; urgency=medium - -- Jenkins Build Slave User Tue, 29 Aug 2023 16:38:35 +0000 + -- Jenkins Build Slave User Mon, 26 Feb 2024 19:34:01 +0000 + +ceph (16.2.15-1) stable; urgency=medium + + * New upstream release + + -- Ceph Release Team Mon, 26 Feb 2024 19:21:07 +0000 ceph (16.2.14-1) stable; urgency=medium diff --git a/ceph/cmake/modules/BuildRocksDB.cmake b/ceph/cmake/modules/BuildRocksDB.cmake index 58ce26a6a..32f8df976 100644 --- a/ceph/cmake/modules/BuildRocksDB.cmake +++ b/ceph/cmake/modules/BuildRocksDB.cmake @@ -56,12 +56,13 @@ function(build_rocksdb) endif() include(CheckCXXCompilerFlag) check_cxx_compiler_flag("-Wno-deprecated-copy" HAS_WARNING_DEPRECATED_COPY) + set(rocksdb_CXX_FLAGS "${CMAKE_CXX_FLAGS}") if(HAS_WARNING_DEPRECATED_COPY) - set(rocksdb_CXX_FLAGS -Wno-deprecated-copy) + string(APPEND rocksdb_CXX_FLAGS " -Wno-deprecated-copy") endif() check_cxx_compiler_flag("-Wno-pessimizing-move" HAS_WARNING_PESSIMIZING_MOVE) if(HAS_WARNING_PESSIMIZING_MOVE) - set(rocksdb_CXX_FLAGS "${rocksdb_CXX_FLAGS} -Wno-pessimizing-move") + string(APPEND rocksdb_CXX_FLAGS " -Wno-pessimizing-move") endif() if(rocksdb_CXX_FLAGS) list(APPEND rocksdb_CMAKE_ARGS -DCMAKE_CXX_FLAGS='${rocksdb_CXX_FLAGS}') diff --git a/ceph/doc/cephfs/administration.rst b/ceph/doc/cephfs/administration.rst index 966b45ee5..ad9599b10 100644 --- a/ceph/doc/cephfs/administration.rst +++ b/ceph/doc/cephfs/administration.rst @@ -15,7 +15,7 @@ creation of multiple file systems use ``ceph fs flag set enable_multiple true``. :: - fs new + ceph fs new This command creates a new file system. The file system name and metadata pool name are self-explanatory. The specified data pool is the default data pool and @@ -25,13 +25,13 @@ to accommodate the new file system. :: - fs ls + ceph fs ls List all file systems by name. :: - fs dump [epoch] + ceph fs dump [epoch] This dumps the FSMap at the given epoch (default: current) which includes all file system settings, MDS daemons and the ranks they hold, and the list of @@ -40,7 +40,7 @@ standby MDS daemons. :: - fs rm [--yes-i-really-mean-it] + ceph fs rm [--yes-i-really-mean-it] Destroy a CephFS file system. This wipes information about the state of the file system from the FSMap. The metadata pool and data pools are untouched and @@ -48,28 +48,28 @@ must be destroyed separately. :: - fs get + ceph fs get Get information about the named file system, including settings and ranks. This -is a subset of the same information from the ``fs dump`` command. +is a subset of the same information from the ``ceph fs dump`` command. :: - fs set + ceph fs set Change a setting on a file system. These settings are specific to the named file system and do not affect other file systems. :: - fs add_data_pool + ceph fs add_data_pool Add a data pool to the file system. This pool can be used for file layouts as an alternate location to store file data. :: - fs rm_data_pool + ceph fs rm_data_pool This command removes the specified pool from the list of data pools for the file system. 
If any files have layouts for the removed data pool, the file @@ -82,7 +82,7 @@ Settings :: - fs set max_file_size + ceph fs set max_file_size CephFS has a configurable maximum file size, and it's 1TB by default. You may wish to set this limit higher if you expect to store large files @@ -116,13 +116,13 @@ Taking a CephFS cluster down is done by setting the down flag: :: - fs set down true + ceph fs set down true To bring the cluster back online: :: - fs set down false + ceph fs set down false This will also restore the previous value of max_mds. MDS daemons are brought down in a way such that journals are flushed to the metadata pool and all @@ -133,11 +133,11 @@ Taking the cluster down rapidly for deletion or disaster recovery ----------------------------------------------------------------- To allow rapidly deleting a file system (for testing) or to quickly bring the -file system and MDS daemons down, use the ``fs fail`` command: +file system and MDS daemons down, use the ``ceph fs fail`` command: :: - fs fail + ceph fs fail This command sets a file system flag to prevent standbys from activating on the file system (the ``joinable`` flag). @@ -146,7 +146,7 @@ This process can also be done manually by doing the following: :: - fs set joinable false + ceph fs set joinable false Then the operator can fail all of the ranks which causes the MDS daemons to respawn as standbys. The file system will be left in a degraded state. @@ -154,7 +154,7 @@ respawn as standbys. The file system will be left in a degraded state. :: # For all ranks, 0-N: - mds fail : + ceph mds fail : Once all ranks are inactive, the file system may also be deleted or left in this state for other purposes (perhaps disaster recovery). @@ -163,7 +163,7 @@ To bring the cluster back up, simply set the joinable flag: :: - fs set joinable true + ceph fs set joinable true Daemons @@ -182,34 +182,35 @@ Commands to manipulate MDS daemons: :: - mds fail + ceph mds fail Mark an MDS daemon as failed. This is equivalent to what the cluster would do if an MDS daemon had failed to send a message to the mon for ``mds_beacon_grace`` second. If the daemon was active and a suitable -standby is available, using ``mds fail`` will force a failover to the standby. +standby is available, using ``ceph mds fail`` will force a failover to the +standby. -If the MDS daemon was in reality still running, then using ``mds fail`` +If the MDS daemon was in reality still running, then using ``ceph mds fail`` will cause the daemon to restart. If it was active and a standby was available, then the "failed" daemon will return as a standby. :: - tell mds. command ... + ceph tell mds. command ... Send a command to the MDS daemon(s). Use ``mds.*`` to send a command to all daemons. Use ``ceph tell mds.* help`` to learn available commands. :: - mds metadata + ceph mds metadata Get metadata about the given MDS known to the Monitors. :: - mds repaired + ceph mds repaired Mark the file system rank as repaired. Unlike the name suggests, this command does not change a MDS; it manipulates the file system rank which has been @@ -228,14 +229,14 @@ Commands to manipulate required client features of a file system: :: - fs required_client_features add reply_encoding - fs required_client_features rm reply_encoding + ceph fs required_client_features add reply_encoding + ceph fs required_client_features rm reply_encoding To list all CephFS features :: - fs feature ls + ceph fs feature ls Clients that are missing newly added features will be evicted automatically. 
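The administration.rst changes above spell out the full `ceph fs ...` and `ceph mds ...` forms of each command. For readers scripting these calls, here is a minimal sketch of issuing the same monitor commands through the `rados` Python binding's `mon_command()` rather than the CLI; the conffile path and the file system name "cephfs" are placeholder assumptions, and the JSON field names reflect the usual `ceph fs ls -f json` output rather than anything introduced by this diff.

```python
# Sketch: drive the `ceph fs ls` / `ceph fs get` commands documented above from
# Python via librados' mon_command(). Paths and the file system name "cephfs"
# are illustrative assumptions.
import json
import rados

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')  # assumed default conf path
cluster.connect()
try:
    # Equivalent of `ceph fs ls --format json`
    ret, outbuf, errs = cluster.mon_command(
        json.dumps({'prefix': 'fs ls', 'format': 'json'}), b'')
    if ret != 0:
        raise RuntimeError(f'fs ls failed: {errs}')
    for fs in json.loads(outbuf):
        # field names as reported by `ceph fs ls -f json`
        print(fs['name'], fs['metadata_pool'], fs['data_pools'])

    # Equivalent of `ceph fs get cephfs --format json`
    ret, outbuf, errs = cluster.mon_command(
        json.dumps({'prefix': 'fs get', 'fs_name': 'cephfs', 'format': 'json'}), b'')
    print(json.loads(outbuf) if ret == 0 else errs)
finally:
    cluster.shutdown()
```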
@@ -330,7 +331,7 @@ Global settings :: - fs flag set [] + ceph fs flag set [] Sets a global CephFS flag (i.e. not specific to a particular file system). Currently, the only flag setting is 'enable_multiple' which allows having @@ -352,13 +353,13 @@ file system. :: - mds rmfailed + ceph mds rmfailed This removes a rank from the failed set. :: - fs reset + ceph fs reset This command resets the file system state to defaults, except for the name and pools. Non-zero ranks are saved in the stopped set. @@ -366,7 +367,7 @@ pools. Non-zero ranks are saved in the stopped set. :: - fs new --fscid --force + ceph fs new --fscid --force This command creates a file system with a specific **fscid** (file system cluster ID). You may want to do this when an application expects the file system's ID to be diff --git a/ceph/doc/cephfs/cephfs-shell.rst b/ceph/doc/cephfs/cephfs-shell.rst index 2fa1decbb..6e6a1b1e3 100644 --- a/ceph/doc/cephfs/cephfs-shell.rst +++ b/ceph/doc/cephfs/cephfs-shell.rst @@ -37,7 +37,7 @@ Options : .. code:: bash [build]$ python3 -m venv venv && source venv/bin/activate && pip3 install cmd2 - [build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/cephfs-shell + [build]$ source vstart_environment.sh && source venv/bin/activate && python3 ../src/tools/cephfs/shell/cephfs-shell Commands ======== diff --git a/ceph/doc/cephfs/client-auth.rst b/ceph/doc/cephfs/client-auth.rst index fd0faa839..a7dea5251 100644 --- a/ceph/doc/cephfs/client-auth.rst +++ b/ceph/doc/cephfs/client-auth.rst @@ -24,6 +24,16 @@ that directory. To restrict clients to only mount and work within a certain directory, use path-based MDS authentication capabilities. +Note that this restriction *only* impacts the filesystem hierarchy -- the metadata +tree managed by the MDS. Clients will still be able to access the underlying +file data in RADOS directly. To segregate clients fully, you must also isolate +untrusted clients in their own RADOS namespace. You can place a client's +filesystem subtree in a particular namespace using `file layouts`_ and then +restrict their RADOS access to that namespace using `OSD capabilities`_ + +.. _file layouts: ./file-layouts +.. _OSD capabilities: ../rados/operations/user-management/#authorization-capabilities + Syntax ------ diff --git a/ceph/doc/cephfs/snap-schedule.rst b/ceph/doc/cephfs/snap-schedule.rst index fb9c85b5a..5c4b2c1de 100644 --- a/ceph/doc/cephfs/snap-schedule.rst +++ b/ceph/doc/cephfs/snap-schedule.rst @@ -38,6 +38,13 @@ below). By default the start time is last midnight. So when a snapshot schedule with repeat interval `1h` is added at 13:50 with the default start time, the first snapshot will be taken at 14:00. +The time zone is assumed to be UTC if none is explicitly included in the string. +An explicit time zone will be mapped to UTC at execution. +The start time must be in ISO8601 format. Examples below: + +UTC: 2022-08-08T05:30:00 i.e. 5:30 AM UTC, without explicit time zone offset +IDT: 2022-08-08T09:00:00+03:00 i.e. 6:00 AM UTC +EDT: 2022-08-08T05:30:00-04:00 i.e. 9:30 AM UTC Retention specifications are identified by path and the retention spec itself. A retention spec consists of either a number and a time period separated by a @@ -155,6 +162,11 @@ Examples:: snapshot creation is accounted for in the "created_count" field, which is a cumulative count of the total number of snapshots created so far. +.. 
note: The maximum number of snapshots to retain per directory is limited by the + config tunable `mds_max_snaps_per_dir`. This tunable defaults to 100. + To ensure a new snapshot can be created, one snapshot less than this will be + retained. So by default, a maximum of 99 snapshots will be retained. + Active and inactive schedules ----------------------------- Snapshot schedules can be added for a path that doesn't exist yet in the diff --git a/ceph/doc/man/8/ceph-objectstore-tool.rst b/ceph/doc/man/8/ceph-objectstore-tool.rst index 19acc5913..7d1e2219e 100644 --- a/ceph/doc/man/8/ceph-objectstore-tool.rst +++ b/ceph/doc/man/8/ceph-objectstore-tool.rst @@ -60,6 +60,8 @@ Possible -op commands:: * meta-list * get-osdmap * set-osdmap +* get-superblock +* set-superblock * get-inc-osdmap * set-inc-osdmap * mark-complete @@ -414,7 +416,7 @@ Options .. option:: --op arg - Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log] + Arg is one of [info, log, remove, mkfs, fsck, repair, fuse, dup, export, export-remove, import, list, fix-lost, list-pgs, dump-journal, dump-super, meta-list, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap, set-inc-osdmap, mark-complete, reset-last-complete, apply-layout-settings, update-mon-db, dump-export, trim-pg-log] .. option:: --epoch arg @@ -422,7 +424,7 @@ Options .. option:: --file arg - path of file to export, export-remove, import, get-osdmap, set-osdmap, get-inc-osdmap or set-inc-osdmap + path of file to export, export-remove, import, get-osdmap, set-osdmap, get-superblock, set-superblock, get-inc-osdmap or set-inc-osdmap .. option:: --mon-store-path arg diff --git a/ceph/doc/man/8/ceph.rst b/ceph/doc/man/8/ceph.rst index 7fefe58d6..c913b45de 100644 --- a/ceph/doc/man/8/ceph.rst +++ b/ceph/doc/man/8/ceph.rst @@ -1314,7 +1314,7 @@ Subcommand ``cache-mode`` specifies the caching mode for cache tier . Usage:: - ceph osd tier cache-mode writeback|readproxy|readonly|none + ceph osd tier cache-mode writeback|proxy|readproxy|readonly|none Subcommand ``remove`` removes the tier (the second one) from base pool (the first one). diff --git a/ceph/doc/man/8/rados.rst b/ceph/doc/man/8/rados.rst index 147313f14..0bbf4b87a 100644 --- a/ceph/doc/man/8/rados.rst +++ b/ceph/doc/man/8/rados.rst @@ -264,8 +264,8 @@ Pool specific commands :command:`append` *name* *infile* Append object name to the cluster with contents from infile. -:command:`rm` *name* - Remove object name. +:command:`rm` [--force-full] *name* ... + Remove object(s) with name(s). With ``--force-full`` will remove when cluster is marked full. :command:`listwatchers` *name* List the watchers of object name. 
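The snap-schedule documentation added above states that a start time without an explicit time zone is assumed to be UTC, while an explicit offset is mapped to UTC at execution. The following standard-library sketch reproduces the three documented examples to illustrate that rule; it is not the snap_schedule module's own parsing code.

```python
# Sketch of the snap-schedule start-time rule documented above: ISO 8601 input,
# assume UTC when no offset is present, otherwise convert the offset to UTC.
from datetime import datetime, timezone

def to_utc(start: str) -> datetime:
    """Parse an ISO 8601 start time; assume UTC when no offset is given."""
    dt = datetime.fromisoformat(start)
    if dt.tzinfo is None:               # no explicit time zone -> assume UTC
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)

for s in ('2022-08-08T05:30:00',        # assumed UTC       -> 05:30 UTC
          '2022-08-08T09:00:00+03:00',  # IDT (UTC+3)       -> 06:00 UTC
          '2022-08-08T05:30:00-04:00'): # EDT (UTC-4)       -> 09:30 UTC
    print(s, '->', to_utc(s).isoformat())
```

Running this prints 05:30, 06:00 and 09:30 UTC respectively, matching the annotations in the documentation above.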
diff --git a/ceph/doc/rados/configuration/bluestore-config-ref.rst b/ceph/doc/rados/configuration/bluestore-config-ref.rst index 6ad316050..1e9b1a0da 100644 --- a/ceph/doc/rados/configuration/bluestore-config-ref.rst +++ b/ceph/doc/rados/configuration/bluestore-config-ref.rst @@ -333,7 +333,7 @@ OSD and run the following command: ceph-bluestore-tool \ --path \ - --sharding="m(3) p(3,0-12) o(3,0-13)=block_cache={type=binned_lru} l p" \ + --sharding="m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P" \ reshard diff --git a/ceph/doc/rados/configuration/ms-ref.rst b/ceph/doc/rados/configuration/ms-ref.rst index 113bd0913..6ccdebaf1 100644 --- a/ceph/doc/rados/configuration/ms-ref.rst +++ b/ceph/doc/rados/configuration/ms-ref.rst @@ -109,17 +109,6 @@ Async messenger options :Default: ``3`` -``ms_async_max_op_threads`` - -:Description: Maximum number of worker threads used by each Async Messenger instance. - Set to lower values when your machine has limited CPU count, and increase - when your CPUs are underutilized (i. e. one or more of CPUs are - constantly on 100% load during I/O operations). -:Type: 64-bit Unsigned Integer -:Required: No -:Default: ``5`` - - ``ms_async_send_inline`` :Description: Send messages directly from the thread that generated them instead of @@ -129,5 +118,3 @@ Async messenger options :Type: Boolean :Required: No :Default: ``false`` - - diff --git a/ceph/doc/rados/configuration/pool-pg-config-ref.rst b/ceph/doc/rados/configuration/pool-pg-config-ref.rst index 3f9c149b6..3a1ff9b68 100644 --- a/ceph/doc/rados/configuration/pool-pg-config-ref.rst +++ b/ceph/doc/rados/configuration/pool-pg-config-ref.rst @@ -4,12 +4,41 @@ .. index:: pools; configuration -Ceph uses default values to determine how many placement groups (PGs) will be -assigned to each pool. We recommend overriding some of the defaults. -Specifically, we recommend setting a pool's replica size and overriding the -default number of placement groups. You can set these values when running -`pool`_ commands. You can also override the defaults by adding new ones in the -``[global]`` section of your Ceph configuration file. +The number of placement groups that the CRUSH algorithm assigns to each pool is +determined by the values of variables in the centralized configuration database +in the monitor cluster. + +Both containerized deployments of Ceph (deployments made using ``cephadm`` or +Rook) and non-containerized deployments of Ceph rely on the values in the +central configuration database in the monitor cluster to assign placement +groups to pools. + +Example Commands +---------------- + +To see the value of the variable that governs the number of placement groups in a given pool, run a command of the following form: + +.. prompt:: bash + + ceph config get osd osd_pool_default_pg_num + +To set the value of the variable that governs the number of placement groups in a given pool, run a command of the following form: + +.. prompt:: bash + + ceph config set osd osd_pool_default_pg_num + +Manual Tuning +------------- +In some cases, it might be advisable to override some of the defaults. For +example, you might determine that it is wise to set a pool's replica size and +to override the default number of placement groups in the pool. You can set +these values when running `pool`_ commands. + +See Also +-------- + +See :ref:`pg-autoscaler`. .. 
literalinclude:: pool-pg.conf diff --git a/ceph/doc/rados/operations/health-checks.rst b/ceph/doc/rados/operations/health-checks.rst index 0282f1834..b0b768c22 100644 --- a/ceph/doc/rados/operations/health-checks.rst +++ b/ceph/doc/rados/operations/health-checks.rst @@ -1404,6 +1404,31 @@ other performance issue with the OSDs. The exact size of the snapshot trim queue is reported by the ``snaptrimq_len`` field of ``ceph pg ls -f json-detail``. +Stretch Mode +------------ + +INCORRECT_NUM_BUCKETS_STRETCH_MODE +__________________________________ + +Stretch mode currently only support 2 dividing buckets with OSDs, this warning suggests +that the number of dividing buckets is not equal to 2 after stretch mode is enabled. +You can expect unpredictable failures and MON assertions until the condition is fixed. + +We encourage you to fix this by removing additional dividing buckets or bump the +number of dividing buckets to 2. + +UNEVEN_WEIGHTS_STRETCH_MODE +___________________________ + +The 2 dividing buckets must have equal weights when stretch mode is enabled. +This warning suggests that the 2 dividing buckets have uneven weights after +stretch mode is enabled. This is not immediately fatal, however, you can expect +Ceph to be confused when trying to process transitions between dividing buckets. + +We encourage you to fix this by making the weights even on both dividing buckets. +This can be done by making sure the combined weight of the OSDs on each dividing +bucket are the same. + Miscellaneous ------------- diff --git a/ceph/doc/radosgw/frontends.rst b/ceph/doc/radosgw/frontends.rst index 274cdce87..e1aa1571a 100644 --- a/ceph/doc/radosgw/frontends.rst +++ b/ceph/doc/radosgw/frontends.rst @@ -127,6 +127,14 @@ Options :Type: Integer :Default: ``65000`` +``max_header_size`` + +:Description: The maximum number of header bytes available for a single request. + +:Type: Integer +:Default: ``16384`` +:Maximum: ``65536`` + Civetweb ======== diff --git a/ceph/make-dist b/ceph/make-dist index 75528c1b8..9765b0b03 100755 --- a/ceph/make-dist +++ b/ceph/make-dist @@ -55,7 +55,7 @@ download_from() { exit fi url=$url_base/$fname - wget -c --no-verbose -O $fname $url + wget --no-verbose -O $fname $url if [ $? != 0 -o ! 
-e $fname ]; then echo "Download of $url failed" elif [ $(sha256sum $fname | awk '{print $1}') != $sha256 ]; then @@ -183,8 +183,7 @@ download_boost $boost_version 4eb3b8d442b426dc35346235c8733b5ae35ba431690e38c6a8 https://boostorg.jfrog.io/artifactory/main/release/$boost_version/source \ https://downloads.sourceforge.net/project/boost/boost/$boost_version \ https://download.ceph.com/qa -download_liburing 0.7 8e2842cfe947f3a443af301bdd6d034455536c38a455c7a700d0c1ad165a7543 \ - https://github.com/axboe/liburing/archive \ +download_liburing 0.7 05d0cf8493d573c76b11abfcf34aabc7153affebe17ff95f9ae88b0de062a59d \ https://git.kernel.dk/cgit/liburing/snapshot pmdk_version=1.10 download_pmdk $pmdk_version 08dafcf94db5ac13fac9139c92225d9aa5f3724ea74beee4e6ca19a01a2eb20c \ diff --git a/ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet b/ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet index 0ea43c96f..0015c7f39 100644 --- a/ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet +++ b/ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet @@ -342,7 +342,7 @@ local g = import 'grafonnet/grafana.libsonnet'; $.graphPanelSchema({}, title, description, - 'null', + 'null as zero', false, formatY1, 'short', diff --git a/ceph/monitoring/ceph-mixin/dashboards/rbd.libsonnet b/ceph/monitoring/ceph-mixin/dashboards/rbd.libsonnet index 0eca5a877..709d4e04f 100644 --- a/ceph/monitoring/ceph-mixin/dashboards/rbd.libsonnet +++ b/ceph/monitoring/ceph-mixin/dashboards/rbd.libsonnet @@ -133,7 +133,7 @@ local u = import 'utils.libsonnet'; $.graphPanelSchema({}, title, '', - 'null', + 'null as zero', false, formatY1, 'short', diff --git a/ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet b/ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet index 892480d1c..49dcf9156 100644 --- a/ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet +++ b/ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet @@ -140,7 +140,7 @@ local u = import 'utils.libsonnet'; {}, title, description, - 'null', + 'null as zero', false, formatY1, formatY2, @@ -658,7 +658,7 @@ local u = import 'utils.libsonnet'; $.graphPanelSchema(aliasColors, title, description, - 'null', + 'null as zero', false, formatY1, formatY2, diff --git a/ceph/monitoring/ceph-mixin/dashboards_out/osd-device-details.json b/ceph/monitoring/ceph-mixin/dashboards_out/osd-device-details.json index 384516fb0..811e6d57e 100644 --- a/ceph/monitoring/ceph-mixin/dashboards_out/osd-device-details.json +++ b/ceph/monitoring/ceph-mixin/dashboards_out/osd-device-details.json @@ -87,7 +87,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -185,7 +185,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -283,7 +283,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -400,7 +400,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -498,7 +498,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -596,7 +596,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", 
"percentage": false, "pointradius": 5, "points": false, diff --git a/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json b/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json index a0f8f3537..4568f9a4d 100644 --- a/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json +++ b/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-detail.json @@ -93,7 +93,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -186,7 +186,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -285,7 +285,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, diff --git a/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json b/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json index 77d69e4f3..a8256c1f5 100644 --- a/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json +++ b/ceph/monitoring/ceph-mixin/dashboards_out/radosgw-overview.json @@ -87,7 +87,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -180,7 +180,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -266,7 +266,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -352,7 +352,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -445,7 +445,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -531,7 +531,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -636,7 +636,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -754,7 +754,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -893,7 +893,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -1000,7 +1000,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, diff --git a/ceph/monitoring/ceph-mixin/dashboards_out/rbd-overview.json b/ceph/monitoring/ceph-mixin/dashboards_out/rbd-overview.json index e017280e0..86b354a60 100644 --- a/ceph/monitoring/ceph-mixin/dashboards_out/rbd-overview.json +++ b/ceph/monitoring/ceph-mixin/dashboards_out/rbd-overview.json @@ -80,7 +80,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -173,7 
+173,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, @@ -266,7 +266,7 @@ "lines": true, "linewidth": 1, "links": [ ], - "nullPointMode": "null", + "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, diff --git a/ceph/monitoring/ceph-mixin/prometheus_alerts.yml b/ceph/monitoring/ceph-mixin/prometheus_alerts.yml index 3b560342e..33a5c5059 100644 --- a/ceph/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/ceph/monitoring/ceph-mixin/prometheus_alerts.yml @@ -518,7 +518,7 @@ groups: annotations: description: "Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours." summary: "Pool growth rate may soon exceed capacity" - expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) group_right ceph_pool_metadata) >= 95" + expr: "(predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) group_right() ceph_pool_metadata) >= 95" labels: oid: "1.3.6.1.4.1.50495.1.2.1.9.2" severity: "warning" diff --git a/ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index ec90743d4..d68b43bad 100644 --- a/ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/ceph/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -1499,35 +1499,44 @@ tests: # trigger percent full prediction on pools 1 and 2 only - interval: 12h input_series: - - series: 'ceph_pool_percent_used{pool_id="1"}' - values: '70 75 80 87 92' - - series: 'ceph_pool_percent_used{pool_id="2"}' - values: '22 22 23 23 24' - - series: 'ceph_pool_metadata{pool_id="1",name="rbd",type="replicated"}' + - series: 'ceph_pool_percent_used{pool_id="1", instance="9090"}' values: '1 1 1 1 1' - - series: 'ceph_pool_metadata{pool_id="2",name="default.rgw.index",type="replicated"}' + - series: 'ceph_pool_percent_used{pool_id="1", instance="8090"}' + values: '78 89 79 98 78' + - series: 'ceph_pool_percent_used{pool_id="2", instance="9090"}' + values: '1 1 1 1 1' + - series: 'ceph_pool_percent_used{pool_id="2", instance="8090"}' + values: '22 22 23 23 24' + - series: 'ceph_pool_metadata{pool_id="1" , instance="9090" ,name="rbd",type="replicated"}' + values: '1 1 1 1 1' + - series: 'ceph_pool_metadata{pool_id="1", instance="8090",name="default.rgw.index",type="replicated"}' + values: '1 1 1 1 1' + - series: 'ceph_pool_metadata{pool_id="2" , instance="9090" ,name="rbd",type="replicated"}' + values: '1 1 1 1 1' + - series: 'ceph_pool_metadata{pool_id="2", instance="8090",name="default.rgw.index",type="replicated"}' values: '1 1 1 1 1' promql_expr_test: - expr: | - (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id) - group_right ceph_pool_metadata) >= 95 + (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(pool_id, instance) + group_right() ceph_pool_metadata) >= 95 eval_time: 36h exp_samples: - - labels: '{name="rbd",pool_id="1",type="replicated"}' - value: 1.424E+02 # 142% + - labels: '{instance="8090",name="default.rgw.index",pool_id="1",type="replicated"}' + value: 1.435E+02 # 142% alert_rule_test: - eval_time: 48h alertname: CephPoolGrowthWarning exp_alerts: - exp_labels: - name: rbd + instance: 8090 + name: default.rgw.index pool_id: 1 severity: warning type: ceph_default oid: 1.3.6.1.4.1.50495.1.2.1.9.2 exp_annotations: summary: Pool growth rate may soon exceed capacity - description: Pool 'rbd' will be full 
in less than 5 days assuming the average fill-up rate of the past 48 hours. + description: Pool 'default.rgw.index' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours. - interval: 1m input_series: - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}' diff --git a/ceph/qa/cephfs/conf/mds.yaml b/ceph/qa/cephfs/conf/mds.yaml index 46403ffa0..b1c7a5869 100644 --- a/ceph/qa/cephfs/conf/mds.yaml +++ b/ceph/qa/cephfs/conf/mds.yaml @@ -3,6 +3,7 @@ overrides: conf: mds: debug mds: 20 + debug mds balancer: 20 debug ms: 1 mds debug frag: true mds debug scatterstat: true diff --git a/ceph/qa/cephfs/overrides/ignorelist_health.yaml b/ceph/qa/cephfs/overrides/ignorelist_health.yaml index 7f0d49eab..a698da517 100644 --- a/ceph/qa/cephfs/overrides/ignorelist_health.yaml +++ b/ceph/qa/cephfs/overrides/ignorelist_health.yaml @@ -2,7 +2,10 @@ overrides: ceph: log-ignorelist: - overall HEALTH_ + - \(CEPHADM_STRAY_DAEMON\) - \(FS_DEGRADED\) + - FS_ + - \(CEPHADM_ - \(MDS_FAILED\) - \(MDS_DEGRADED\) - \(FS_WITH_FAILED_MDS\) @@ -10,3 +13,10 @@ overrides: - \(MDS_ALL_DOWN\) - \(MDS_UP_LESS_THAN_MAX\) - \(FS_INLINE_DATA_DEPRECATED\) + - \(PG_DEGRADED\) + - Degraded data redundancy + - \(PG_ + - acting + - MDS_INSUFFICIENT_STANDBY + - deprecated feature inline_data + - compat changed unexpectedly diff --git a/ceph/qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml b/ceph/qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml index 41ba84f04..64c8c24f5 100644 --- a/ceph/qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml +++ b/ceph/qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml @@ -2,8 +2,10 @@ overrides: ceph: log-ignorelist: - overall HEALTH_ - - \(OSD_DOWN\) - - \(OSD_ + - OSD_DOWN + - OSD_ - but it is still running # MDS daemon 'b' is not responding, replacing it as rank 0 with standby 'a' - is not responding + - is down + - osds down diff --git a/ceph/qa/distros/all/rhel_8.5.yaml b/ceph/qa/distros/all/rhel_8.5.yaml new file mode 100644 index 000000000..12713a45e --- /dev/null +++ b/ceph/qa/distros/all/rhel_8.5.yaml @@ -0,0 +1,6 @@ +os_type: rhel +os_version: "8.5" +overrides: + selinux: + whitelist: + - scontext=system_u:system_r:logrotate_t:s0 diff --git a/ceph/qa/distros/all/rhel_8.6.yaml b/ceph/qa/distros/all/rhel_8.6.yaml new file mode 100644 index 000000000..34c3f10be --- /dev/null +++ b/ceph/qa/distros/all/rhel_8.6.yaml @@ -0,0 +1,6 @@ +os_type: rhel +os_version: "8.6" +overrides: + selinux: + whitelist: + - scontext=system_u:system_r:logrotate_t:s0 diff --git a/ceph/qa/distros/all/rhel_8.yaml b/ceph/qa/distros/all/rhel_8.yaml index c7867a423..d49c09cc1 120000 --- a/ceph/qa/distros/all/rhel_8.yaml +++ b/ceph/qa/distros/all/rhel_8.yaml @@ -1 +1 @@ -rhel_8.4.yaml \ No newline at end of file +rhel_8.6.yaml \ No newline at end of file diff --git a/ceph/qa/distros/container-hosts/rhel_8.4_container_tools_3.0.yaml b/ceph/qa/distros/container-hosts/rhel_8.4_container_tools_3.0.yaml deleted file mode 120000 index 9e1ab9a0e..000000000 --- a/ceph/qa/distros/container-hosts/rhel_8.4_container_tools_3.0.yaml +++ /dev/null @@ -1 +0,0 @@ -.qa/distros/podman/rhel_8.4_container_tools_3.0.yaml \ No newline at end of file diff --git a/ceph/qa/distros/container-hosts/rhel_8.4_container_tools_rhel8.yaml b/ceph/qa/distros/container-hosts/rhel_8.4_container_tools_rhel8.yaml deleted file mode 120000 index b4b0a7892..000000000 --- a/ceph/qa/distros/container-hosts/rhel_8.4_container_tools_rhel8.yaml +++ /dev/null @@ -1 +0,0 @@ 
-.qa/distros/podman/rhel_8.4_container_tools_rhel8.yaml \ No newline at end of file diff --git a/ceph/qa/distros/container-hosts/rhel_8.6_container_tools_3.0.yaml b/ceph/qa/distros/container-hosts/rhel_8.6_container_tools_3.0.yaml new file mode 120000 index 000000000..91f9539f1 --- /dev/null +++ b/ceph/qa/distros/container-hosts/rhel_8.6_container_tools_3.0.yaml @@ -0,0 +1 @@ +.qa/distros/podman/rhel_8.6_container_tools_3.0.yaml \ No newline at end of file diff --git a/ceph/qa/distros/container-hosts/rhel_8.6_container_tools_rhel8.yaml b/ceph/qa/distros/container-hosts/rhel_8.6_container_tools_rhel8.yaml new file mode 120000 index 000000000..c9abcd7b8 --- /dev/null +++ b/ceph/qa/distros/container-hosts/rhel_8.6_container_tools_rhel8.yaml @@ -0,0 +1 @@ +.qa/distros/podman/rhel_8.6_container_tools_rhel8.yaml \ No newline at end of file diff --git a/ceph/qa/distros/podman/rhel_8.4_container_tools_3.0.yaml b/ceph/qa/distros/podman/rhel_8.6_container_tools_3.0.yaml similarity index 95% rename from ceph/qa/distros/podman/rhel_8.4_container_tools_3.0.yaml rename to ceph/qa/distros/podman/rhel_8.6_container_tools_3.0.yaml index 01b1ad51e..361d8546e 100644 --- a/ceph/qa/distros/podman/rhel_8.4_container_tools_3.0.yaml +++ b/ceph/qa/distros/podman/rhel_8.6_container_tools_3.0.yaml @@ -1,5 +1,5 @@ os_type: rhel -os_version: "8.4" +os_version: "8.6" overrides: selinux: whitelist: diff --git a/ceph/qa/distros/podman/rhel_8.4_container_tools_rhel8.yaml b/ceph/qa/distros/podman/rhel_8.6_container_tools_rhel8.yaml similarity index 95% rename from ceph/qa/distros/podman/rhel_8.4_container_tools_rhel8.yaml rename to ceph/qa/distros/podman/rhel_8.6_container_tools_rhel8.yaml index 741868eb4..be94ed69e 100644 --- a/ceph/qa/distros/podman/rhel_8.4_container_tools_rhel8.yaml +++ b/ceph/qa/distros/podman/rhel_8.6_container_tools_rhel8.yaml @@ -1,5 +1,5 @@ os_type: rhel -os_version: "8.4" +os_version: "8.6" overrides: selinux: whitelist: diff --git a/ceph/qa/standalone/ceph-helpers.sh b/ceph/qa/standalone/ceph-helpers.sh index ce0d749b4..26ba3a084 100755 --- a/ceph/qa/standalone/ceph-helpers.sh +++ b/ceph/qa/standalone/ceph-helpers.sh @@ -1691,6 +1691,29 @@ function test_wait_for_peered() { ####################################################################### +## +# Wait until the cluster's health condition disappeared. +# $TIMEOUT default +# +# @param string to grep for in health detail +# @return 0 if the cluster health doesn't matches request, +# 1 otherwise if after $TIMEOUT seconds health condition remains. +# +function wait_for_health_gone() { + local grepstr=$1 + local -a delays=($(get_timeout_delays $TIMEOUT .1)) + local -i loop=0 + + while ceph health detail | grep "$grepstr" ; do + if (( $loop >= ${#delays[*]} )) ; then + ceph health detail + return 1 + fi + sleep ${delays[$loop]} + loop+=1 + done +} + ## # Wait until the cluster has health condition passed as arg # again for $TIMEOUT seconds. 
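The new `wait_for_health_gone()` helper added to ceph-helpers.sh above polls `ceph health detail` until a given warning string disappears, sleeping between attempts with the delays produced by `get_timeout_delays`. Below is a rough Python equivalent of that wait-until-cleared pattern; the capped doubling backoff is a simplified stand-in for the helper's delay schedule, and the `ceph` CLI is assumed to be on PATH.

```python
# Sketch of the polling pattern behind wait_for_health_gone(): keep checking
# `ceph health detail` until a warning string disappears, backing off between
# attempts. Delay schedule and timeout are simplified assumptions.
import subprocess
import time

def wait_for_health_gone(grepstr: str, timeout: float = 120.0) -> bool:
    delay, waited = 0.1, 0.0
    while waited < timeout:
        detail = subprocess.run(['ceph', 'health', 'detail'],
                                capture_output=True, text=True).stdout
        if grepstr not in detail:
            return True                 # warning cleared
        time.sleep(delay)
        waited += delay
        delay = min(delay * 2, 10.0)    # capped backoff
    return False                        # warning still present after timeout

# e.g. wait_for_health_gone('UNEVEN_WEIGHTS_STRETCH_MODE'), mirroring how the
# shell helper is used by the stretch-mode test added later in this diff.
```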
diff --git a/ceph/qa/standalone/mon-stretch/mon-stretch-fail-recovery.sh b/ceph/qa/standalone/mon-stretch/mon-stretch-fail-recovery.sh new file mode 100755 index 000000000..f61b51f76 --- /dev/null +++ b/ceph/qa/standalone/mon-stretch/mon-stretch-fail-recovery.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh +function run() { + local dir=$1 + shift + + export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one + export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one + export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one + export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one + export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + + export BASE_CEPH_ARGS=$CEPH_ARGS + CEPH_ARGS+="--mon-host=$CEPH_MON_A" + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} +TEST_stretched_cluster_failover_add_three_osds(){ + local dir=$1 + local OSDS=8 + setup $dir || return 1 + + run_mon $dir a --public-addr $CEPH_MON_A || return 1 + wait_for_quorum 300 1 || return 1 + + run_mon $dir b --public-addr $CEPH_MON_B || return 1 + CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B" + wait_for_quorum 300 2 || return 1 + + run_mon $dir c --public-addr $CEPH_MON_C || return 1 + CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C" + wait_for_quorum 300 3 || return 1 + + run_mon $dir d --public-addr $CEPH_MON_D || return 1 + CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D" + wait_for_quorum 300 4 || return 1 + + run_mon $dir e --public-addr $CEPH_MON_E || return 1 + CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E" + wait_for_quorum 300 5 || return 1 + + ceph mon set election_strategy connectivity + ceph mon add disallowed_leader e + + run_mgr $dir x || return 1 + run_mgr $dir y || return 1 + run_mgr $dir z || return 1 + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for zone in iris pze + do + ceph osd crush add-bucket $zone zone + ceph osd crush move $zone root=default + done + + + ceph osd crush add-bucket node-2 host + ceph osd crush add-bucket node-3 host + ceph osd crush add-bucket node-4 host + ceph osd crush add-bucket node-5 host + + ceph osd crush move node-2 zone=iris + ceph osd crush move node-3 zone=iris + ceph osd crush move node-4 zone=pze + ceph osd crush move node-5 zone=pze + + ceph osd crush move osd.0 host=node-2 + ceph osd crush move osd.1 host=node-2 + ceph osd crush move osd.2 host=node-3 + ceph osd crush move osd.3 host=node-3 + ceph osd crush move osd.4 host=node-4 + ceph osd crush move osd.5 host=node-4 + ceph osd crush move osd.6 host=node-5 + ceph osd crush move osd.7 host=node-5 + + ceph mon set_location a zone=iris host=node-2 + ceph mon set_location b zone=iris host=node-3 + ceph mon set_location c zone=pze host=node-4 + ceph mon set_location d zone=pze host=node-5 + + hostname=$(hostname -s) + ceph osd crush remove $hostname || return 1 + ceph osd getcrushmap > crushmap || return 1 + crushtool --decompile crushmap > crushmap.txt || return 1 + sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1 + cat >> 
crushmap_modified.txt << EOF +rule stretch_rule { + id 1 + type replicated + min_size 1 + max_size 10 + step take iris + step chooseleaf firstn 2 type host + step emit + step take pze + step chooseleaf firstn 2 type host + step emit +} + +# end crush map +EOF + + crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1 + ceph osd setcrushmap -i crushmap.bin || return 1 + local stretched_poolname=stretched_rbdpool + ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1 + ceph osd pool set $stretched_poolname size 4 || return 1 + + sleep 3 + + ceph mon set_location e zone=arbiter host=node-1 + ceph mon enable_stretch_mode e stretch_rule zone + + kill_daemons $dir KILL mon.c || return 1 + kill_daemons $dir KILL mon.d || return 1 + + kill_daemons $dir KILL osd.4 || return 1 + kill_daemons $dir KILL osd.5 || return 1 + kill_daemons $dir KILL osd.6 || return 1 + kill_daemons $dir KILL osd.7 || return 1 + + ceph -s + + sleep 3 + + run_osd $dir 8 || return 1 + run_osd $dir 9 || return 1 + run_osd $dir 10 || return 1 + + ceph -s + + sleep 3 + + teardown $dir || return 1 +} +main mon-stretch-fail-recovery "$@" diff --git a/ceph/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh b/ceph/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh new file mode 100755 index 000000000..7e13f4076 --- /dev/null +++ b/ceph/qa/standalone/mon-stretch/mon-stretch-uneven-crush-weights.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash + +source $CEPH_ROOT/qa/standalone/ceph-helpers.sh +function run() { + local dir=$1 + shift + + export CEPH_MON_A="127.0.0.1:7139" # git grep '\<7139\>' : there must be only one + export CEPH_MON_B="127.0.0.1:7141" # git grep '\<7141\>' : there must be only one + export CEPH_MON_C="127.0.0.1:7142" # git grep '\<7142\>' : there must be only one + export CEPH_MON_D="127.0.0.1:7143" # git grep '\<7143\>' : there must be only one + export CEPH_MON_E="127.0.0.1:7144" # git grep '\<7144\>' : there must be only one + export CEPH_ARGS + CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none " + + export BASE_CEPH_ARGS=$CEPH_ARGS + CEPH_ARGS+="--mon-host=$CEPH_MON_A" + + local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} + for func in $funcs ; do + setup $dir || return 1 + $func $dir || return 1 + teardown $dir || return 1 + done +} +TEST_stretched_cluster_uneven_weight() { + local dir=$1 + local OSDS=4 + local weight=0.09000 + setup $dir || return 1 + + run_mon $dir a --public-addr $CEPH_MON_A || return 1 + wait_for_quorum 300 1 || return 1 + + run_mon $dir b --public-addr $CEPH_MON_B || return 1 + CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B" + wait_for_quorum 300 2 || return 1 + + run_mon $dir c --public-addr $CEPH_MON_C || return 1 + CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C" + wait_for_quorum 300 3 || return 1 + + run_mon $dir d --public-addr $CEPH_MON_D || return 1 + CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D" + wait_for_quorum 300 4 || return 1 + + run_mon $dir e --public-addr $CEPH_MON_E || return 1 + CEPH_ARGS="$BASE_CEPH_ARGS --mon-host=$CEPH_MON_A,$CEPH_MON_B,$CEPH_MON_C,$CEPH_MON_D,$CEPH_MON_E" + wait_for_quorum 300 5 || return 1 + + ceph mon set election_strategy connectivity + ceph mon add disallowed_leader e + + run_mgr $dir x || return 1 + run_mgr $dir y || return 1 + run_mgr $dir z || return 1 + + for osd in $(seq 0 $(expr $OSDS - 1)) + do + run_osd $dir $osd || return 1 + done + + for zone in iris pze + do + ceph osd crush 
add-bucket $zone zone + ceph osd crush move $zone root=default + done + + ceph osd crush add-bucket node-2 host + ceph osd crush add-bucket node-3 host + ceph osd crush add-bucket node-4 host + ceph osd crush add-bucket node-5 host + + ceph osd crush move node-2 zone=iris + ceph osd crush move node-3 zone=iris + ceph osd crush move node-4 zone=pze + ceph osd crush move node-5 zone=pze + + ceph osd crush move osd.0 host=node-2 + ceph osd crush move osd.1 host=node-3 + ceph osd crush move osd.2 host=node-4 + ceph osd crush move osd.3 host=node-5 + + ceph mon set_location a zone=iris host=node-2 + ceph mon set_location b zone=iris host=node-3 + ceph mon set_location c zone=pze host=node-4 + ceph mon set_location d zone=pze host=node-5 + + hostname=$(hostname -s) + ceph osd crush remove $hostname || return 1 + ceph osd getcrushmap > crushmap || return 1 + crushtool --decompile crushmap > crushmap.txt || return 1 + sed 's/^# end crush map$//' crushmap.txt > crushmap_modified.txt || return 1 + cat >> crushmap_modified.txt << EOF +rule stretch_rule { + id 1 + type replicated + min_size 1 + max_size 10 + step take iris + step chooseleaf firstn 2 type host + step emit + step take pze + step chooseleaf firstn 2 type host + step emit +} +# end crush map +EOF + + crushtool --compile crushmap_modified.txt -o crushmap.bin || return 1 + ceph osd setcrushmap -i crushmap.bin || return 1 + local stretched_poolname=stretched_rbdpool + ceph osd pool create $stretched_poolname 32 32 stretch_rule || return 1 + ceph osd pool set $stretched_poolname size 4 || return 1 + + ceph mon set_location e zone=arbiter host=node-1 || return 1 + ceph mon enable_stretch_mode e stretch_rule zone || return 1 # Enter strech mode + + # reweight to a more round decimal. + ceph osd crush reweight osd.0 $weight + ceph osd crush reweight osd.1 $weight + ceph osd crush reweight osd.2 $weight + ceph osd crush reweight osd.3 $weight + + # Firstly, we test for stretch mode buckets != 2 + ceph osd crush add-bucket sham zone || return 1 + ceph osd crush move sham root=default || return 1 + wait_for_health "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1 + + ceph osd crush rm sham # clear the health warn + wait_for_health_gone "INCORRECT_NUM_BUCKETS_STRETCH_MODE" || return 1 + + # Next, we test for uneven weights across buckets + + ceph osd crush reweight osd.0 0.07000 + + wait_for_health "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1 + + ceph osd crush reweight osd.0 $weight # clear the health warn + + wait_for_health_gone "UNEVEN_WEIGHTS_STRETCH_MODE" || return 1 + + teardown $dir || return 1 +} +main mon-stretched-cluster-uneven-weight "$@" \ No newline at end of file diff --git a/ceph/qa/suites/fs/functional/tasks/damage.yaml b/ceph/qa/suites/fs/functional/tasks/damage.yaml index ff8b3a58a..7703aee93 100644 --- a/ceph/qa/suites/fs/functional/tasks/damage.yaml +++ b/ceph/qa/suites/fs/functional/tasks/damage.yaml @@ -19,6 +19,7 @@ overrides: - MDS_READ_ONLY - force file system read-only - with standby daemon mds + - MDS abort because newly corrupt dentry tasks: - cephfs_test_runner: modules: diff --git a/ceph/qa/suites/fs/thrash/multifs/overrides/client-shutdown.yaml b/ceph/qa/suites/fs/thrash/multifs/overrides/client-shutdown.yaml new file mode 100644 index 000000000..30b2ea981 --- /dev/null +++ b/ceph/qa/suites/fs/thrash/multifs/overrides/client-shutdown.yaml @@ -0,0 +1,6 @@ +# Lengthen the timeout for thrashed MDS +overrides: + ceph: + conf: + client: + client_shutdown_timeout: 120 diff --git 
a/ceph/qa/suites/fs/thrash/workloads/overrides/client-shutdown.yaml b/ceph/qa/suites/fs/thrash/workloads/overrides/client-shutdown.yaml new file mode 100644 index 000000000..30b2ea981 --- /dev/null +++ b/ceph/qa/suites/fs/thrash/workloads/overrides/client-shutdown.yaml @@ -0,0 +1,6 @@ +# Lengthen the timeout for thrashed MDS +overrides: + ceph: + conf: + client: + client_shutdown_timeout: 120 diff --git a/ceph/qa/suites/fs/volumes/tasks/volumes/test/finisher_per_module.yaml b/ceph/qa/suites/fs/volumes/tasks/volumes/test/finisher_per_module.yaml new file mode 100644 index 000000000..ec8335fe0 --- /dev/null +++ b/ceph/qa/suites/fs/volumes/tasks/volumes/test/finisher_per_module.yaml @@ -0,0 +1,13 @@ +tasks: + - check-counter: + counters: + mgr: + - name: "finisher-volumes.complete_latency.avgcount" + min: 4 + - name: "finisher-volumes.queue_len" + expected_val: 0 + + - cephfs_test_runner: + fail_on_skip: false + modules: + - tasks.cephfs.test_volumes.TestPerModuleFinsherThread diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/% b/ceph/qa/suites/krbd/singleton-msgr-failures/% new file mode 100644 index 000000000..e69de29bb diff --git a/ceph/qa/suites/krbd/singleton/msgr-failures/.qa b/ceph/qa/suites/krbd/singleton-msgr-failures/.qa similarity index 100% rename from ceph/qa/suites/krbd/singleton/msgr-failures/.qa rename to ceph/qa/suites/krbd/singleton-msgr-failures/.qa diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/bluestore-bitmap.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/bluestore-bitmap.yaml new file mode 120000 index 000000000..a59cf5175 --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/bluestore-bitmap.yaml @@ -0,0 +1 @@ +.qa/objectstore/bluestore-bitmap.yaml \ No newline at end of file diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/conf.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/conf.yaml new file mode 100644 index 000000000..5e7ed992e --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/conf.yaml @@ -0,0 +1,7 @@ +overrides: + ceph: + conf: + global: + ms die on skipped message: false + client: + rbd default features: 37 diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/.qa b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/.qa new file mode 120000 index 000000000..a602a0353 --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/crc-rxbounce.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/crc-rxbounce.yaml new file mode 100644 index 000000000..4d27d0113 --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/crc-rxbounce.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rbd default map options: ms_mode=crc,rxbounce diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/crc.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/crc.yaml new file mode 100644 index 000000000..3b072578f --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/crc.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rbd default map options: ms_mode=crc diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/legacy-rxbounce.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/legacy-rxbounce.yaml new file mode 100644 index 000000000..244e45cbc --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/legacy-rxbounce.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + 
rbd default map options: ms_mode=legacy,rxbounce diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/legacy.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/legacy.yaml new file mode 100644 index 000000000..0048dcb0c --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/legacy.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rbd default map options: ms_mode=legacy diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/secure.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/secure.yaml new file mode 100644 index 000000000..a735db18d --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/ms_mode$/secure.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + client: + rbd default map options: ms_mode=secure diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/.qa b/ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/.qa new file mode 120000 index 000000000..a602a0353 --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git a/ceph/qa/suites/krbd/singleton/msgr-failures/few.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/few.yaml similarity index 100% rename from ceph/qa/suites/krbd/singleton/msgr-failures/few.yaml rename to ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/few.yaml diff --git a/ceph/qa/suites/krbd/singleton/msgr-failures/many.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/many.yaml similarity index 100% rename from ceph/qa/suites/krbd/singleton/msgr-failures/many.yaml rename to ceph/qa/suites/krbd/singleton-msgr-failures/msgr-failures/many.yaml diff --git a/ceph/qa/suites/krbd/singleton-msgr-failures/tasks/.qa b/ceph/qa/suites/krbd/singleton-msgr-failures/tasks/.qa new file mode 120000 index 000000000..a602a0353 --- /dev/null +++ b/ceph/qa/suites/krbd/singleton-msgr-failures/tasks/.qa @@ -0,0 +1 @@ +../.qa/ \ No newline at end of file diff --git a/ceph/qa/suites/krbd/singleton/tasks/rbd_xfstests.yaml b/ceph/qa/suites/krbd/singleton-msgr-failures/tasks/rbd_xfstests.yaml similarity index 100% rename from ceph/qa/suites/krbd/singleton/tasks/rbd_xfstests.yaml rename to ceph/qa/suites/krbd/singleton-msgr-failures/tasks/rbd_xfstests.yaml diff --git a/ceph/qa/suites/krbd/singleton/conf.yaml b/ceph/qa/suites/krbd/singleton/conf.yaml index 5e7ed992e..41292fa81 100644 --- a/ceph/qa/suites/krbd/singleton/conf.yaml +++ b/ceph/qa/suites/krbd/singleton/conf.yaml @@ -2,6 +2,7 @@ overrides: ceph: conf: global: + mon warn on pool no app: false ms die on skipped message: false client: rbd default features: 37 diff --git a/ceph/qa/suites/krbd/singleton/tasks/krbd_watch_errors.yaml b/ceph/qa/suites/krbd/singleton/tasks/krbd_watch_errors.yaml new file mode 100644 index 000000000..5e30ef2ba --- /dev/null +++ b/ceph/qa/suites/krbd/singleton/tasks/krbd_watch_errors.yaml @@ -0,0 +1,19 @@ +overrides: + ceph: + conf: + global: + osd pool default size: 1 + osd: + osd shutdown pgref assert: true +roles: +- [mon.a, mgr.x, osd.0, client.0] + +tasks: +- install: + extra_system_packages: + - fio +- ceph: +- workunit: + clients: + all: + - rbd/krbd_watch_errors.sh diff --git a/ceph/qa/suites/orch/cephadm/dashboard/task/test_e2e.yaml b/ceph/qa/suites/orch/cephadm/dashboard/task/test_e2e.yaml index cb6ffb22f..ca7268ac6 100644 --- a/ceph/qa/suites/orch/cephadm/dashboard/task/test_e2e.yaml +++ b/ceph/qa/suites/orch/cephadm/dashboard/task/test_e2e.yaml @@ -1,3 +1,28 @@ +overrides: + 
ceph: + log-ignorelist: + - \(HOST_IN_MAINTENANCE\) + - \(OSD_DOWN\) + - \(MON_DOWN\) + - down + - overall HEALTH_ + - \(CEPHADM_STRAY_DAEMON\) + - stray daemon + - \(FS_DEGRADED\) + - \(MDS_FAILED\) + - \(MDS_DEGRADED\) + - \(FS_WITH_FAILED_MDS\) + - \(MDS_DAMAGE\) + - \(MDS_ALL_DOWN\) + - \(MDS_UP_LESS_THAN_MAX\) + - \(FS_INLINE_DATA_DEPRECATED\) + - \(PG_DEGRADED\) + - Degraded data redundancy + - \(PG_ + - acting + - MDS_INSUFFICIENT_STANDBY + - deprecated feature inline_data + - compat changed unexpectedly roles: # 3 osd roles on host.a is required for cephadm task. It checks if the cluster is healthy. # More daemons will be deployed on both hosts in e2e tests. diff --git a/ceph/qa/suites/orch/cephadm/mgr-nfs-upgrade/1-start.yaml b/ceph/qa/suites/orch/cephadm/mgr-nfs-upgrade/1-start.yaml index 2d9f09a4e..db4b26053 100644 --- a/ceph/qa/suites/orch/cephadm/mgr-nfs-upgrade/1-start.yaml +++ b/ceph/qa/suites/orch/cephadm/mgr-nfs-upgrade/1-start.yaml @@ -24,6 +24,21 @@ openstack: size: 10 # GB overrides: ceph: + log-ignorelist: + - slow requests + - \(PG_ + - PG_ + - \(CEPHADM_STRAY_DAEMON\) + - slow request + - \(MDS_ + - MDS_ + - osds down + - OSD_ + - \(OSD_ + - client + - FS_ + - \(FS_ + - degraded conf: osd: osd shutdown pgref assert: true diff --git a/ceph/qa/suites/orch/cephadm/osds/2-ops/repave-all.yaml b/ceph/qa/suites/orch/cephadm/osds/2-ops/repave-all.yaml index 16413aba8..fd1f9c6c3 100644 --- a/ceph/qa/suites/orch/cephadm/osds/2-ops/repave-all.yaml +++ b/ceph/qa/suites/orch/cephadm/osds/2-ops/repave-all.yaml @@ -1,3 +1,10 @@ +overrides: + ceph: + log-ignorelist: + - \(MON_DOWN\) + - \(OSD_DOWN\) + - \(PG_ + - but it is still running tasks: - cephadm.shell: host.a: diff --git a/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-add.yaml b/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-add.yaml index 09be72f11..fb925e88a 100644 --- a/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-add.yaml +++ b/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-add.yaml @@ -1,3 +1,10 @@ +overrides: + ceph: + log-ignorelist: + - \(MON_DOWN\) + - \(OSD_DOWN\) + - \(PG_ + - but it is still running tasks: - cephadm.shell: host.a: diff --git a/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-flag.yaml b/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-flag.yaml index 8f07f6d53..db612a16c 100644 --- a/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-flag.yaml +++ b/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-flag.yaml @@ -1,3 +1,10 @@ +overrides: + ceph: + log-ignorelist: + - \(MON_DOWN\) + - \(OSD_DOWN\) + - \(PG_ + - but it is still running tasks: - cephadm.shell: host.a: diff --git a/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-wait.yaml b/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-wait.yaml index 78161aa49..72c043e8b 100644 --- a/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-wait.yaml +++ b/ceph/qa/suites/orch/cephadm/osds/2-ops/rm-zap-wait.yaml @@ -1,3 +1,10 @@ +overrides: + ceph: + log-ignorelist: + - \(MON_DOWN\) + - \(OSD_DOWN\) + - \(PG_ + - but it is still running tasks: - cephadm.shell: host.a: diff --git a/ceph/qa/suites/orch/cephadm/osds/2-ops/rmdir-reactivate.yaml b/ceph/qa/suites/orch/cephadm/osds/2-ops/rmdir-reactivate.yaml index a971a02e4..501dea155 100644 --- a/ceph/qa/suites/orch/cephadm/osds/2-ops/rmdir-reactivate.yaml +++ b/ceph/qa/suites/orch/cephadm/osds/2-ops/rmdir-reactivate.yaml @@ -1,3 +1,11 @@ +overrides: + ceph: + log-ignorelist: + - \(MON_DOWN\) + - \(OSD_DOWN\) + - \(PG_ + - but it is still running + - \(CEPHADM_STRAY_DAEMON\) tasks: - cephadm.shell: host.a: diff --git 
a/ceph/qa/suites/orch/cephadm/smoke/start.yaml b/ceph/qa/suites/orch/cephadm/smoke/start.yaml index 77f493ca1..6a43d0f3d 100644 --- a/ceph/qa/suites/orch/cephadm/smoke/start.yaml +++ b/ceph/qa/suites/orch/cephadm/smoke/start.yaml @@ -1,3 +1,11 @@ +overrides: + ceph: + log-ignorelist: + - \(MON_DOWN\) + - \(PG_AVAILABILITY\) + - mon down + - mons down + - out of quorum tasks: - cephadm: conf: diff --git a/ceph/qa/suites/orch/cephadm/thrash/2-thrash.yaml b/ceph/qa/suites/orch/cephadm/thrash/2-thrash.yaml index 05e0f8e76..2f45d7676 100644 --- a/ceph/qa/suites/orch/cephadm/thrash/2-thrash.yaml +++ b/ceph/qa/suites/orch/cephadm/thrash/2-thrash.yaml @@ -3,6 +3,23 @@ overrides: log-ignorelist: - but it is still running - objects unfound and apparently lost + - \(MON_DOWN\) + - \(OSDMAP_FLAGS\) + - flag\(s\) set + - \(CACHE_POOL_NO_HIT_SET\) + - \(CACHE_ + - \(PG_ + - \(OSD_ + - \(POOL_ + - \(CEPHADM_STRAY_DAEMON\) + - PG_ + - CACHE_ + - degraded + - backfill + - mons down + - OSD_ + - is down + - acting conf: osd: osd debug reject backfill probability: .3 diff --git a/ceph/qa/suites/orch/cephadm/upgrade/4-wait.yaml b/ceph/qa/suites/orch/cephadm/upgrade/4-wait.yaml index 58afe00c5..ddba90e93 100644 --- a/ceph/qa/suites/orch/cephadm/upgrade/4-wait.yaml +++ b/ceph/qa/suites/orch/cephadm/upgrade/4-wait.yaml @@ -1,3 +1,14 @@ +overrides: + ceph: + log-ignorelist: + - \(MON_DOWN\) + - \(PG_ + - mons down + - pg inactive + - out of quorum + - \(OSD_ + - osds down + - osd down tasks: - cephadm.shell: env: [sha1] diff --git a/ceph/qa/suites/orch/cephadm/workunits/task/test_nfs.yaml b/ceph/qa/suites/orch/cephadm/workunits/task/test_nfs.yaml index 8448c1a2f..afa9deecb 100644 --- a/ceph/qa/suites/orch/cephadm/workunits/task/test_nfs.yaml +++ b/ceph/qa/suites/orch/cephadm/workunits/task/test_nfs.yaml @@ -1,3 +1,9 @@ +overrides: + ceph: + log-ignorelist: + - Replacing daemon mds + - FS_DEGRADED + - \(CEPHADM_STRAY_DAEMON\) roles: - - host.a - osd.0 diff --git a/ceph/qa/suites/orch/cephadm/workunits/task/test_orch_cli.yaml b/ceph/qa/suites/orch/cephadm/workunits/task/test_orch_cli.yaml index ec65fb116..a1b8a4c0f 100644 --- a/ceph/qa/suites/orch/cephadm/workunits/task/test_orch_cli.yaml +++ b/ceph/qa/suites/orch/cephadm/workunits/task/test_orch_cli.yaml @@ -1,3 +1,10 @@ +overrides: + ceph: + log-ignorelist: + - \(MON_DOWN\) + - \(OSD_DOWN\) + - \(CEPHADM_PAUSED\) + - mons down roles: - - host.a - osd.0 diff --git a/ceph/qa/suites/orch/cephadm/workunits/task/test_orch_cli_mon.yaml b/ceph/qa/suites/orch/cephadm/workunits/task/test_orch_cli_mon.yaml index 2a33dc839..5e8d286db 100644 --- a/ceph/qa/suites/orch/cephadm/workunits/task/test_orch_cli_mon.yaml +++ b/ceph/qa/suites/orch/cephadm/workunits/task/test_orch_cli_mon.yaml @@ -1,3 +1,10 @@ +overrides: + ceph: + log-ignorelist: + - \(MON_DOWN\) + - mons down + - \(MGR_DOWN\) + - out of quorum roles: - - host.a - osd.0 diff --git a/ceph/qa/suites/rados/basic/tasks/rados_api_tests.yaml b/ceph/qa/suites/rados/basic/tasks/rados_api_tests.yaml index a54e03b1a..47b293e4c 100644 --- a/ceph/qa/suites/rados/basic/tasks/rados_api_tests.yaml +++ b/ceph/qa/suites/rados/basic/tasks/rados_api_tests.yaml @@ -11,6 +11,15 @@ overrides: - \(POOL_APP_NOT_ENABLED\) - \(PG_AVAILABILITY\) - \(PG_DEGRADED\) + - \(MON_DOWN\) + - \(CEPHADM_STRAY_DAEMON\) + - missing hit_sets + - do not have an application enabled + - application not enabled on pool + - pool application + - mons down + - out of quorum + - needs hit_set_type to be set but it is not conf: client: debug ms: 1 diff --git 
a/ceph/qa/suites/rados/basic/tasks/rados_cls_all.yaml b/ceph/qa/suites/rados/basic/tasks/rados_cls_all.yaml index 8f5b79027..8896ccb44 100644 --- a/ceph/qa/suites/rados/basic/tasks/rados_cls_all.yaml +++ b/ceph/qa/suites/rados/basic/tasks/rados_cls_all.yaml @@ -2,6 +2,7 @@ overrides: ceph: log-ignorelist: - \(PG_AVAILABILITY\) + - \(POOL_APP_NOT_ENABLED\) conf: osd: osd_class_load_list: "*" diff --git a/ceph/qa/suites/rados/basic/tasks/rados_python.yaml b/ceph/qa/suites/rados/basic/tasks/rados_python.yaml index 965909450..f89b07ca1 100644 --- a/ceph/qa/suites/rados/basic/tasks/rados_python.yaml +++ b/ceph/qa/suites/rados/basic/tasks/rados_python.yaml @@ -8,6 +8,13 @@ overrides: - \(OSD_ - \(OBJECT_ - \(POOL_APP_NOT_ENABLED\) + - \(MON_DOWN\) + - mons down + - application not enabled on pool + - do not have an application enabled + - pool application + - out of quorum + - needs hit_set_type to be set but it is not tasks: - workunit: clients: diff --git a/ceph/qa/suites/rados/mgr/tasks/per_module_finisher_stats.yaml b/ceph/qa/suites/rados/mgr/tasks/per_module_finisher_stats.yaml new file mode 100644 index 000000000..a2f087ab1 --- /dev/null +++ b/ceph/qa/suites/rados/mgr/tasks/per_module_finisher_stats.yaml @@ -0,0 +1,43 @@ +tasks: + - install: + - ceph: + wait-for-scrub: false + - check-counter: + counters: + mgr: + - name: "finisher-balancer.complete_latency.avgcount" + min: 1 + - name: "finisher-balancer.queue_len" + expected_val: 0 + - name: "finisher-crash.complete_latency.avgcount" + min: 2 + - name: "finisher-crash.queue_len" + expected_val: 0 + - name: "finisher-devicehealth.complete_latency.avgcount" + min: 1 + - name: "finisher-devicehealth.queue_len" + expected_val: 0 + - name: "finisher-iostat.complete_latency.avgcount" + min: 1 + - name: "finisher-iostat.queue_len" + expected_val: 0 + - name: "finisher-pg_autoscaler.complete_latency.avgcount" + min: 1 + - name: "finisher-pg_autoscaler.queue_len" + expected_val: 0 + - name: "finisher-progress.complete_latency.avgcount" + min: 2 + - name: "finisher-progress.queue_len" + expected_val: 0 + - name: "finisher-status.complete_latency.avgcount" + min: 2 + - name: "finisher-status.queue_len" + expected_val: 0 + - name: "finisher-telemetry.complete_latency.avgcount" + min: 1 + - name: "finisher-telemetry.queue_len" + expected_val: 0 + - workunit: + clients: + client.0: + - mgr/test_per_module_finisher.sh diff --git a/ceph/qa/suites/rados/mgr/tasks/workunits.yaml b/ceph/qa/suites/rados/mgr/tasks/workunits.yaml index 6074de0ed..791adc272 100644 --- a/ceph/qa/suites/rados/mgr/tasks/workunits.yaml +++ b/ceph/qa/suites/rados/mgr/tasks/workunits.yaml @@ -13,4 +13,4 @@ tasks: - workunit: clients: client.0: - - mgr \ No newline at end of file + - mgr/test_localpool.sh diff --git a/ceph/qa/suites/rados/standalone/workloads/mon-stretch.yaml b/ceph/qa/suites/rados/standalone/workloads/mon-stretch.yaml new file mode 100644 index 000000000..d039126c5 --- /dev/null +++ b/ceph/qa/suites/rados/standalone/workloads/mon-stretch.yaml @@ -0,0 +1,18 @@ +roles: +- - mon.a + - mgr.x + - osd.0 + - osd.1 + - osd.2 + - client.0 +openstack: + - volumes: # attached to each instance + count: 3 + size: 10 # GB +tasks: +- install: +- workunit: + basedir: qa/standalone + clients: + all: + - mon-stretch \ No newline at end of file diff --git a/ceph/qa/suites/rados/verify/tasks/rados_cls_all.yaml b/ceph/qa/suites/rados/verify/tasks/rados_cls_all.yaml index 163bc2c08..0236326f3 100644 --- a/ceph/qa/suites/rados/verify/tasks/rados_cls_all.yaml +++ 
b/ceph/qa/suites/rados/verify/tasks/rados_cls_all.yaml @@ -4,6 +4,8 @@ overrides: osd: osd_class_load_list: "*" osd_class_default_list: "*" + log-ignorelist: + - \(POOL_APP_NOT_ENABLED\) tasks: - workunit: clients: diff --git a/ceph/qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml b/ceph/qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml new file mode 100644 index 000000000..aa4d0001f --- /dev/null +++ b/ceph/qa/suites/rbd/cli/workloads/rbd_support_module_recovery.yaml @@ -0,0 +1,13 @@ +overrides: + ceph: + conf: + mgr: + debug rbd: 20 +tasks: +- install: + extra_system_packages: + - fio +- workunit: + clients: + client.0: + - rbd/rbd_support_module_recovery.sh diff --git a/ceph/qa/suites/rgw/verify/tasks/bucket-check.yaml b/ceph/qa/suites/rgw/verify/tasks/bucket-check.yaml new file mode 100644 index 000000000..4955d41c6 --- /dev/null +++ b/ceph/qa/suites/rgw/verify/tasks/bucket-check.yaml @@ -0,0 +1,5 @@ +tasks: +- workunit: + clients: + client.0: + - rgw/run-bucket-check.sh diff --git a/ceph/qa/suites/upgrade/nautilus-x/parallel/2-workload/rgw_ragweed_prepare.yaml b/ceph/qa/suites/upgrade/nautilus-x/parallel/2-workload/rgw_ragweed_prepare.yaml index 5dbffc73d..207b68cc3 100644 --- a/ceph/qa/suites/upgrade/nautilus-x/parallel/2-workload/rgw_ragweed_prepare.yaml +++ b/ceph/qa/suites/upgrade/nautilus-x/parallel/2-workload/rgw_ragweed_prepare.yaml @@ -6,7 +6,7 @@ workload: - sequential: - ragweed: client.1: - default-branch: ceph-pacific + default-branch: ceph-nautilus rgw_server: client.1 stages: prepare - print: "**** done rgw ragweed prepare 2-workload" diff --git a/ceph/qa/suites/upgrade/nautilus-x/parallel/5-final-workload/rgw_ragweed_check.yaml b/ceph/qa/suites/upgrade/nautilus-x/parallel/5-final-workload/rgw_ragweed_check.yaml index 2e94f2503..c91d91f6e 100644 --- a/ceph/qa/suites/upgrade/nautilus-x/parallel/5-final-workload/rgw_ragweed_check.yaml +++ b/ceph/qa/suites/upgrade/nautilus-x/parallel/5-final-workload/rgw_ragweed_check.yaml @@ -5,7 +5,7 @@ rgw-final-workload: full_sequential: - ragweed: client.1: - default-branch: ceph-pacific + default-branch: ceph-nautilus rgw_server: client.1 stages: check - print: "**** done ragweed check 4-final-workload" diff --git a/ceph/qa/suites/upgrade/octopus-x/parallel-no-cephadm/5-final-workload/rgw_ragweed_check.yaml b/ceph/qa/suites/upgrade/octopus-x/parallel-no-cephadm/5-final-workload/rgw_ragweed_check.yaml index 2e94f2503..6f1df8380 100644 --- a/ceph/qa/suites/upgrade/octopus-x/parallel-no-cephadm/5-final-workload/rgw_ragweed_check.yaml +++ b/ceph/qa/suites/upgrade/octopus-x/parallel-no-cephadm/5-final-workload/rgw_ragweed_check.yaml @@ -5,7 +5,7 @@ rgw-final-workload: full_sequential: - ragweed: client.1: - default-branch: ceph-pacific + default-branch: ceph-octopus rgw_server: client.1 stages: check - print: "**** done ragweed check 4-final-workload" diff --git a/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-parallel/point-to-point-upgrade.yaml b/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-parallel/point-to-point-upgrade.yaml index ebaf84199..c4f6e5077 100644 --- a/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-parallel/point-to-point-upgrade.yaml +++ b/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-parallel/point-to-point-upgrade.yaml @@ -123,7 +123,7 @@ workload_pacific: - rados/test.sh - cls env: - CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.snapshots_namespaces' + CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot' - print: "**** done rados/test.sh & cls workload_pacific" - sequential: - rgw: [client.0] diff 
--git a/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-cls.yaml b/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-cls.yaml index caaac875c..1233f70b0 100644 --- a/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-cls.yaml +++ b/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/4-workload/rbd-cls.yaml @@ -7,4 +7,6 @@ stress-tasks: clients: client.0: - cls/test_cls_rbd.sh + env: + CLS_RBD_GTEST_FILTER: '*:-TestClsRbd.mirror_snapshot' - print: "**** done cls/test_cls_rbd.sh 4-workload" diff --git a/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/rbd-python.yaml b/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/rbd-python.yaml index 4ca4e7485..7e2e98adf 100644 --- a/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/rbd-python.yaml +++ b/ceph/qa/suites/upgrade/pacific-p2p/pacific-p2p-stress-split/6-final-workload/rbd-python.yaml @@ -3,7 +3,7 @@ meta: librbd python api tests tasks: - workunit: - tag: v16.2.7 + branch: pacific clients: client.0: - rbd/test_librbd_python.sh diff --git a/ceph/qa/tasks/ceph_manager.py b/ceph/qa/tasks/ceph_manager.py index 4bf22e2f5..9d47832b3 100644 --- a/ceph/qa/tasks/ceph_manager.py +++ b/ceph/qa/tasks/ceph_manager.py @@ -232,6 +232,7 @@ class OSDThrasher(Thrasher): self.chance_thrash_pg_upmap_items = self.config.get('chance_thrash_pg_upmap', 1.0) self.random_eio = self.config.get('random_eio') self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3) + self.chance_reset_purged_snaps_last = self.config.get('chance_reset_purged_snaps_last', 0.3) num_osds = self.in_osds + self.out_osds self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * len(num_osds) @@ -798,6 +799,19 @@ class OSDThrasher(Thrasher): else: self.cancel_force_recovery() + def reset_purged_snaps_last(self): + """ + Run reset_purged_snaps_last + """ + self.log('reset_purged_snaps_last') + for osd in self.in_osds: + try: + self.ceph_manager.raw_cluster_cmd( + 'tell', "osd.%s" % (str(osd)), + 'reset_purged_snaps_last') + except CommandFailedError: + self.log('Failed to reset_purged_snaps_last, ignoring') + def all_up(self): """ Make sure all osds are up and not out. 
@@ -1248,6 +1262,8 @@ class OSDThrasher(Thrasher): actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,)) if self.chance_force_recovery > 0: actions.append((self.force_cancel_recovery, self.chance_force_recovery)) + if self.chance_reset_purged_snaps_last > 0: + actions.append((self.reset_purged_snaps_last, self.chance_reset_purged_snaps_last)) for key in ['heartbeat_inject_failure', 'filestore_inject_stall']: for scenario in [ diff --git a/ceph/qa/tasks/cephadm.conf b/ceph/qa/tasks/cephadm.conf index bd1ab821e..2a83b083d 100644 --- a/ceph/qa/tasks/cephadm.conf +++ b/ceph/qa/tasks/cephadm.conf @@ -2,6 +2,8 @@ # make logging friendly to teuthology log_to_file = true log_to_stderr = false +log to journald = false +mon cluster log to file = true mon cluster log file level = debug mon clock drift allowed = 1.000 diff --git a/ceph/qa/tasks/cephfs/mount.py b/ceph/qa/tasks/cephfs/mount.py index b6c164281..0f18702f9 100644 --- a/ceph/qa/tasks/cephfs/mount.py +++ b/ceph/qa/tasks/cephfs/mount.py @@ -811,7 +811,7 @@ class CephFSMount(object): )) p.wait() - def open_background(self, basename="background_file", write=True): + def open_background(self, basename="background_file", write=True, content="content"): """ Open a file for writing, then block such that the client will hold a capability. @@ -828,12 +828,11 @@ class CephFSMount(object): import time with open("{path}", 'w') as f: - f.write('content') + f.write("{content}") f.flush() - f.write('content2') while True: time.sleep(1) - """).format(path=path) + """).format(path=path, content=content) else: pyscript = dedent(""" import time @@ -849,7 +848,10 @@ class CephFSMount(object): # This wait would not be sufficient if the file had already # existed, but it's simple and in practice users of open_background # are not using it on existing files. 
- self.wait_for_visible(basename) + if write: + self.wait_for_visible(basename, size=len(content)) + else: + self.wait_for_visible(basename) return rproc @@ -887,19 +889,27 @@ class CephFSMount(object): if nr_links == 2: return - def wait_for_visible(self, basename="background_file", timeout=30): + def wait_for_visible(self, basename="background_file", size=None, timeout=30): i = 0 + args = ['stat'] + if size is not None: + args += ['--printf=%s'] + args += [os.path.join(self.hostfs_mntpt, basename)] while i < timeout: - r = self.client_remote.run(args=[ - 'stat', os.path.join(self.hostfs_mntpt, basename) - ], check_status=False) - if r.exitstatus == 0: - log.debug("File {0} became visible from {1} after {2}s".format( - basename, self.client_id, i)) - return - else: - time.sleep(1) - i += 1 + p = self.client_remote.run(args=args, stdout=StringIO(), check_status=False) + if p.exitstatus == 0: + if size is not None: + s = p.stdout.getvalue().strip() + if int(s) == size: + log.info(f"File {basename} became visible with size {size} from {self.client_id} after {i}s") + return + else: + log.error(f"File {basename} became visible but with size {int(s)} not {size}") + else: + log.info(f"File {basename} became visible from {self.client_id} after {i}s") + return + time.sleep(1) + i += 1 raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format( i, basename, self.client_id)) diff --git a/ceph/qa/tasks/cephfs/test_cephfs_shell.py b/ceph/qa/tasks/cephfs/test_cephfs_shell.py index 8995d260f..477d0ae09 100644 --- a/ceph/qa/tasks/cephfs/test_cephfs_shell.py +++ b/ceph/qa/tasks/cephfs/test_cephfs_shell.py @@ -1,6 +1,8 @@ """ -Before running this testsuite, add path to cephfs-shell module to $PATH and -export $PATH. +NOTE: For running this tests locally (using vstart_runner.py), export the +path to src/tools/cephfs/shell/cephfs-shell module to $PATH. Running +"export PATH=$PATH:$(cd ../src/tools/cephfs/shell && pwd)" from the build dir +will update the environment without hassles of typing the path correctly. """ from io import StringIO from os import path diff --git a/ceph/qa/tasks/cephfs/test_client_limits.py b/ceph/qa/tasks/cephfs/test_client_limits.py index 7d9cf2a76..74cb17678 100644 --- a/ceph/qa/tasks/cephfs/test_client_limits.py +++ b/ceph/qa/tasks/cephfs/test_client_limits.py @@ -9,7 +9,9 @@ from textwrap import dedent from tasks.ceph_test_case import TestTimeoutError from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming from tasks.cephfs.fuse_mount import FuseMount +from teuthology.exceptions import CommandFailedError import os +from io import StringIO log = logging.getLogger(__name__) @@ -157,29 +159,49 @@ class TestClientLimits(CephFSTestCase): a fraction of second (0.5) by default when throttling condition is met. """ - max_caps_per_client = 500 - cap_acquisition_throttle = 250 + subdir_count = 4 + files_per_dir = 25 - self.config_set('mds', 'mds_max_caps_per_client', max_caps_per_client) - self.config_set('mds', 'mds_session_cap_acquisition_throttle', cap_acquisition_throttle) + # throttle in a way so that two dir reads are already hitting it. + throttle_value = (files_per_dir * 3) // 2 - # Create 1500 files split across 6 directories, 250 each. 
- for i in range(1, 7): - self.mount_a.create_n_files("dir{0}/file".format(i), cap_acquisition_throttle, sync=True) + # activate throttling logic by setting max per client to a low value + self.config_set('mds', 'mds_max_caps_per_client', 1) + self.config_set('mds', 'mds_session_cap_acquisition_throttle', throttle_value) + + # Create files split across {subdir_count} directories, {per_dir_count} in each dir + for i in range(1, subdir_count+1): + self.mount_a.create_n_files("dir{0}/file".format(i), files_per_dir, sync=True) mount_a_client_id = self.mount_a.get_global_id() - # recursive readdir - self.mount_a.run_shell_payload("find | wc") - - # validate cap_acquisition decay counter after readdir to exceed throttle count i.e 250 - cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value'] - self.assertGreaterEqual(cap_acquisition_value, cap_acquisition_throttle) + # recursive readdir. macOs wants an explicit directory for `find`. + proc = self.mount_a.run_shell_payload("find . | wc", stderr=StringIO()) + # return code may be None if the command got interrupted + self.assertTrue(proc.returncode is None or proc.returncode == 0, proc.stderr.getvalue()) # validate the throttle condition to be hit atleast once cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle'] self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1) + # validate cap_acquisition decay counter after readdir to NOT exceed the throttle value + # plus one batch that could have been taken immediately before querying + # assuming the batch is equal to the per dir file count. + cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value'] + self.assertLessEqual(cap_acquisition_value, files_per_dir + throttle_value) + + # make sure that the throttle was reported in the events + def historic_ops_have_event(expected_event): + ops_dump = self.fs.rank_tell(['dump_historic_ops']) + # reverse the events and the ops assuming that later ops would be throttled + for op in reversed(ops_dump['ops']): + for ev in reversed(op.get('type_data', {}).get('events', [])): + if ev['event'] == expected_event: + return True + return False + + self.assertTrue(historic_ops_have_event('cap_acquisition_throttle')) + def test_client_release_bug(self): """ When a client has a bug (which we will simulate) preventing it from releasing caps, @@ -219,6 +241,55 @@ class TestClientLimits(CephFSTestCase): self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) rproc.wait() + def test_client_blocklisted_oldest_tid(self): + """ + that a client is blocklisted when its encoded session metadata exceeds the + configured threshold (due to ever growing `completed_requests` caused due + to an unidentified bug (in the client or the MDS)). + """ + + # num of requests client issues + max_requests = 10000 + + # The debug hook to inject the failure only exists in the fuse client + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to inject client release failure") + + self.config_set('client', 'client inject fixed oldest tid', 'true') + self.mount_a.teardown() + self.mount_a.mount_wait() + + self.config_set('mds', 'mds_max_completed_requests', max_requests); + + # Create lots of files + self.mount_a.create_n_files("testdir/file1", max_requests + 100) + + # Create a few files synchronously. This makes sure previous requests are completed + self.mount_a.create_n_files("testdir/file2", 5, True) + + # Wait for the health warnings. 
Assume mds can handle 10 request per second at least + self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id)) + + # set the threshold low so that it has a high probability of + # hitting. + self.config_set('mds', 'mds_session_metadata_threshold', 5000); + + # Create lot many files synchronously. This would hit the session metadata threshold + # causing the client to get blocklisted. + with self.assertRaises(CommandFailedError): + self.mount_a.create_n_files("testdir/file2", 100000, True) + + self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr()) + # the mds should bump up the relevant perf counter + pd = self.perf_dump() + self.assertGreater(pd['mds_sessions']['mdthresh_evicted'], 0) + + # reset the config + self.config_set('client', 'client inject fixed oldest tid', 'false') + + self.mount_a.kill_cleanup() + self.mount_a.mount_wait() + def test_client_oldest_tid(self): """ When a client does not advance its oldest tid, the MDS should notice that diff --git a/ceph/qa/tasks/cephfs/test_client_recovery.py b/ceph/qa/tasks/cephfs/test_client_recovery.py index 50204a6c2..082e7fa84 100644 --- a/ceph/qa/tasks/cephfs/test_client_recovery.py +++ b/ceph/qa/tasks/cephfs/test_client_recovery.py @@ -10,8 +10,10 @@ from textwrap import dedent import time import distutils.version as version import re +import string import os +from teuthology import contextutil from teuthology.orchestra import run from teuthology.orchestra.run import CommandFailedError from tasks.cephfs.fuse_mount import FuseMount @@ -221,8 +223,10 @@ class TestClientRecovery(CephFSTestCase): # Capability release from stale session # ===================================== if write: - cap_holder = self.mount_a.open_background() + content = ''.join(random.choices(string.ascii_uppercase + string.digits, k=16)) + cap_holder = self.mount_a.open_background(content=content) else: + content = '' self.mount_a.run_shell(["touch", "background_file"]) self.mount_a.umount_wait() self.mount_a.mount_wait() @@ -233,7 +237,7 @@ class TestClientRecovery(CephFSTestCase): # Wait for the file to be visible from another client, indicating # that mount_a has completed its network ops - self.mount_b.wait_for_visible() + self.mount_b.wait_for_visible(size=len(content)) # Simulate client death self.mount_a.suspend_netns() @@ -264,11 +268,9 @@ class TestClientRecovery(CephFSTestCase): "Capability handover took {0}, expected approx {1}".format( cap_waited, session_timeout )) - - self.mount_a._kill_background(cap_holder) finally: - # teardown() doesn't quite handle this case cleanly, so help it out - self.mount_a.resume_netns() + self.mount_a.resume_netns() # allow the mount to recover otherwise background proc is unkillable + self.mount_a._kill_background(cap_holder) def test_stale_read_caps(self): self._test_stale_caps(False) @@ -319,9 +321,9 @@ class TestClientRecovery(CephFSTestCase): cap_waited, session_timeout / 2.0 )) - self.mount_a._kill_background(cap_holder) finally: - self.mount_a.resume_netns() + self.mount_a.resume_netns() # allow the mount to recover otherwise background proc is unkillable + self.mount_a._kill_background(cap_holder) def test_trim_caps(self): # Trim capability when reconnecting MDS @@ -387,7 +389,6 @@ class TestClientRecovery(CephFSTestCase): self.mount_b.check_filelock(do_flock=flockable) - # Tear down the background process self.mount_a._kill_background(lock_holder) def test_filelock_eviction(self): @@ -416,7 +417,6 @@ class TestClientRecovery(CephFSTestCase): # 
succeed self.wait_until_true(lambda: lock_taker.finished, timeout=10) finally: - # Tear down the background process self.mount_a._kill_background(lock_holder) # teardown() doesn't quite handle this case cleanly, so help it out @@ -751,24 +751,27 @@ class TestClientOnLaggyOSD(CephFSTestCase): # it takes time to have laggy clients entries in cluster log, # wait for 6 minutes to see if it is visible, finally restart # the client - tries = 6 - while True: - try: - with self.assert_cluster_log("1 client(s) laggy due to laggy OSDs", - timeout=55): - # make sure clients weren't evicted - self.assert_session_count(2) - break - except AssertionError: - tries -= 1 - if tries: - continue - raise + with contextutil.safe_while(sleep=5, tries=6) as proceed: + while proceed(): + try: + with self.assert_cluster_log("1 client(s) laggy due to" + " laggy OSDs", + timeout=55): + # make sure clients weren't evicted + self.assert_session_count(2) + break + except (AssertionError, CommandFailedError) as e: + log.debug(f'{e}, retrying') + + # clear lagginess, expect to get the warning cleared and make sure + # client gets evicted + self.clear_laggy_params(osd) + self.wait_for_health_clear(60) + self.assert_session_count(1) finally: self.mount_a.kill_cleanup() self.mount_a.mount_wait() self.mount_a.create_destroy() - self.clear_laggy_params(osd) def test_client_eviction_if_config_is_unset(self): """ @@ -800,6 +803,11 @@ class TestClientOnLaggyOSD(CephFSTestCase): time.sleep(session_timeout) self.assert_session_count(1) + + # make sure warning wasn't seen in cluster log + with self.assert_cluster_log("laggy due to laggy OSDs", + timeout=120, present=False): + pass finally: self.mount_a.kill_cleanup() self.mount_a.mount_wait() diff --git a/ceph/qa/tasks/cephfs/test_damage.py b/ceph/qa/tasks/cephfs/test_damage.py index d83187017..bfaa23453 100644 --- a/ceph/qa/tasks/cephfs/test_damage.py +++ b/ceph/qa/tasks/cephfs/test_damage.py @@ -608,8 +608,9 @@ class TestDamage(CephFSTestCase): self.fs.flush() self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0") time.sleep(5) # for conf to percolate - p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False) - self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout) + with self.assert_cluster_log("MDS abort because newly corrupt dentry"): + p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout) self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first") self.fs.rank_freeze(False, rank=0) self.delete_mds_coredump(rank0['name']) @@ -642,9 +643,10 @@ class TestDamage(CephFSTestCase): rank0 = self.fs.get_rank() self.fs.rank_freeze(True, rank=0) # so now we want to trigger commit but this will crash, so: - c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"] - p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30) - self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout) + with self.assert_cluster_log("MDS abort because newly corrupt dentry"): + c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"] + p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout) self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first") self.fs.rank_freeze(False, rank=0) 
self.delete_mds_coredump(rank0['name']) diff --git a/ceph/qa/tasks/cephfs/test_failover.py b/ceph/qa/tasks/cephfs/test_failover.py index 63de9a2c1..2ea58ab3a 100644 --- a/ceph/qa/tasks/cephfs/test_failover.py +++ b/ceph/qa/tasks/cephfs/test_failover.py @@ -14,9 +14,12 @@ class TestClusterAffinity(CephFSTestCase): CLIENTS_REQUIRED = 0 MDSS_REQUIRED = 4 - def _verify_join_fs(self, target, status=None): + def _verify_join_fs(self, target, status=None, fs=None): + fs_select = fs + if fs_select is None: + fs_select = self.fs if status is None: - status = self.fs.wait_for_daemons(timeout=30) + status = fs_select.wait_for_daemons(timeout=30) log.debug("%s", status) target = sorted(target, key=operator.itemgetter('name')) log.info("target = %s", target) @@ -37,11 +40,14 @@ class TestClusterAffinity(CephFSTestCase): return self.fail("no entity") - def _verify_init(self): - status = self.fs.status() + def _verify_init(self, fs=None): + fs_select = fs + if fs_select is None: + fs_select = self.fs + status = fs_select.status() log.info("status = {0}".format(status)) target = [{'join_fscid': -1, 'name': info['name']} for info in status.get_all()] - self._verify_join_fs(target, status=status) + self._verify_join_fs(target, status=status, fs=fs_select) return (status, target) def _reach_target(self, target): @@ -109,12 +115,21 @@ class TestClusterAffinity(CephFSTestCase): fs2 = self.mds_cluster.newfs(name="cephfs2") status, target = self._verify_init() active = self.fs.get_active_names(status=status)[0] + status2, _ = self._verify_init(fs=fs2) + active2 = fs2.get_active_names(status=status2)[0] standbys = [info['name'] for info in status.get_standbys()] victim = standbys.pop() # Set a bogus fs on the others for mds in standbys: self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2') self._change_target_state(target, mds, {'join_fscid': fs2.id}) + # The active MDS for cephfs2 will be replaced by the MDS for which + # file system affinity has been set. Also, set the affinity for + # the earlier active MDS so that it is not chosen by the monitors + # as an active MDS for the existing file system. 
+ log.info(f'assigning affinity to cephfs2 for active mds (mds.{active2})') + self.config_set(f'mds.{active2}', 'mds_join_fs', 'cephfs2') + self._change_target_state(target, active2, {'join_fscid': fs2.id}) self.fs.rank_fail() self._change_target_state(target, victim, {'state': 'up:active'}) self._reach_target(target) diff --git a/ceph/qa/tasks/cephfs/test_fstop.py b/ceph/qa/tasks/cephfs/test_fstop.py index ed76eaac2..8294fceec 100644 --- a/ceph/qa/tasks/cephfs/test_fstop.py +++ b/ceph/qa/tasks/cephfs/test_fstop.py @@ -66,7 +66,7 @@ class TestFSTop(CephFSTestCase): Tests 'cephfs-top --dump' output is valid """ def verify_fstop_metrics(metrics): - clients = metrics.get(self.fs.name, {}) + clients = metrics.get('filesystems').get(self.fs.name, {}) if str(self.mount_a.get_global_id()) in clients and \ str(self.mount_b.get_global_id()) in clients: return True diff --git a/ceph/qa/tasks/cephfs/test_mirroring.py b/ceph/qa/tasks/cephfs/test_mirroring.py index a5f8cdac7..c1a940e3f 100644 --- a/ceph/qa/tasks/cephfs/test_mirroring.py +++ b/ceph/qa/tasks/cephfs/test_mirroring.py @@ -1261,3 +1261,38 @@ class TestMirroring(CephFSTestCase): self.verify_snapshot('d2', 'snap0') self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_local_and_remote_dir_root_mode(self): + log.debug('reconfigure client auth caps') + cid = self.mount_b.client_id + data_pool = self.backup_fs.get_data_pool_name() + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', f"client.{cid}", + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', f"allow rw pool={data_pool}, allow rw pool={data_pool}") + + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + self.mount_a.run_shell(["mkdir", "l1"]) + self.mount_a.run_shell(["mkdir", "l1/.snap/snap0"]) + self.mount_a.run_shell(["chmod", "go-rwx", "l1"]) + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/l1') + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + time.sleep(60) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/l1', 'snap0', 1) + + mode_local = self.mount_a.run_shell(["stat", "--format=%A", "l1"]).stdout.getvalue().strip() + mode_remote = self.mount_b.run_shell(["stat", "--format=%A", "l1"]).stdout.getvalue().strip() + + self.assertTrue(mode_local == mode_remote, f"mode mismatch, local mode: {mode_local}, remote mode: {mode_remote}") + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.mount_a.run_shell(["rmdir", "l1/.snap/snap0"]) + self.mount_a.run_shell(["rmdir", "l1"]) diff --git a/ceph/qa/tasks/cephfs/test_nfs.py b/ceph/qa/tasks/cephfs/test_nfs.py index c14400520..61dce16f8 100644 --- a/ceph/qa/tasks/cephfs/test_nfs.py +++ b/ceph/qa/tasks/cephfs/test_nfs.py @@ -22,7 +22,7 @@ class TestNFS(MgrTestCase): return self._cmd("nfs", *args) def _nfs_complete_cmd(self, cmd): - return self.mgr_cluster.mon_manager.run_cluster_cmd(args=f"nfs {cmd}", + return self.mgr_cluster.mon_manager.run_cluster_cmd(args=["nfs"] + cmd, stdout=StringIO(), stderr=StringIO(), check_status=False) @@ -150,8 +150,8 @@ class TestNFS(MgrTestCase): try: # Disable any running nfs ganesha daemon self._check_nfs_server_status() - cluster_create = self._nfs_complete_cmd( - f'cluster create {self.cluster_id}') + cmd = ["cluster", "create", self.cluster_id] + 
cluster_create = self._nfs_complete_cmd(cmd) if cluster_create.stderr and 'cluster already exists' \ in cluster_create.stderr.getvalue(): self._test_delete_cluster() diff --git a/ceph/qa/tasks/cephfs/test_snap_schedules.py b/ceph/qa/tasks/cephfs/test_snap_schedules.py index 6fc53c267..41438ef86 100644 --- a/ceph/qa/tasks/cephfs/test_snap_schedules.py +++ b/ceph/qa/tasks/cephfs/test_snap_schedules.py @@ -364,6 +364,9 @@ class TestSnapSchedules(TestSnapSchedulesHelper): snap_stats['fs_count'] = fs_count snap_stats['db_count'] = db_count + log.debug(f'fs_count: {fs_count}') + log.debug(f'db_count: {db_count}') + return snap_stats def verify_snap_stats(self, dir_path): @@ -469,6 +472,42 @@ class TestSnapSchedules(TestSnapSchedulesHelper): # remove snapshot schedule self.fs_snap_schedule_cmd('remove', path="/bad-path") + def test_snap_schedule_for_number_of_snaps_retention(self): + """ + Test that number of snaps retained are as per user spec. + """ + total_snaps = 55 + test_dir = '/' + TestSnapSchedules.TEST_DIRECTORY + + self.mount_a.run_shell(['mkdir', '-p', test_dir[1:]]) + + # set a schedule on the dir + self.fs_snap_schedule_cmd('add', path=test_dir, snap_schedule='1M') + self.fs_snap_schedule_cmd('retention', 'add', path=test_dir, + retention_spec_or_period=f'{total_snaps}n') + exec_time = time.time() + + timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M') + + # verify snapshot schedule + self.verify_schedule(test_dir, ['1M']) + + # we wait for total_snaps snaps to be taken + wait_time = timo_1 + total_snaps * 60 + 15 + time.sleep(wait_time) + + snap_stats = self.get_snap_stats(test_dir) + self.assertTrue(snap_stats['fs_count'] == total_snaps) + self.assertTrue(snap_stats['db_count'] >= total_snaps) + + # remove snapshot schedule + self.fs_snap_schedule_cmd('remove', path=test_dir) + + # remove all scheduled snapshots + self.remove_snapshots(test_dir[1:]) + + self.mount_a.run_shell(['rmdir', test_dir[1:]]) + class TestSnapSchedulesSnapdir(TestSnapSchedulesHelper): def remove_snapshots(self, dir_path, sdn): diff --git a/ceph/qa/tasks/cephfs/test_volumes.py b/ceph/qa/tasks/cephfs/test_volumes.py index 3997ae90d..50cbdef3f 100644 --- a/ceph/qa/tasks/cephfs/test_volumes.py +++ b/ceph/qa/tasks/cephfs/test_volumes.py @@ -578,6 +578,24 @@ class TestVolumes(TestVolumesHelper): self.assertEqual(vol_info["used_size"], 0, "Size should be zero when volumes directory is empty") + def test_volume_info_pending_subvol_deletions(self): + """ + Tests the pending_subvolume_deletions in 'fs volume info' command + """ + subvolname = self._generate_random_subvolume_name() + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--mode=777") + # create 3K zero byte files + self._do_subvolume_io(subvolname, number_of_files=3000, file_size=0) + # Delete the subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + # get volume metadata + vol_info = json.loads(self._get_volume_info(self.volname)) + self.assertNotEqual(vol_info['pending_subvolume_deletions'], 0, + "pending_subvolume_deletions should be 1") + # verify trash dir is clean + self._wait_for_trash_empty() + def test_volume_info_without_subvolumegroup(self): """ Tests the 'fs volume info' command without subvolume group @@ -7817,3 +7835,29 @@ class TestMisc(TestVolumesHelper): # remove group self._fs_cmd("subvolumegroup", "rm", self.volname, group) + +class TestPerModuleFinsherThread(TestVolumesHelper): + """ + Per module finisher thread tests related to mgr/volume cmds. 
+ This is used in conjuction with check_counter with min val being 4 + as four subvolume cmds are run + """ + def test_volumes_module_finisher_thread(self): + subvol1, subvol2, subvol3 = self._generate_random_subvolume_name(3) + group = self._generate_random_group_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolumes in group + self._fs_cmd("subvolume", "create", self.volname, subvol1, "--group_name", group) + self._fs_cmd("subvolume", "create", self.volname, subvol2, "--group_name", group) + self._fs_cmd("subvolume", "create", self.volname, subvol3, "--group_name", group) + + self._fs_cmd("subvolume", "rm", self.volname, subvol1, group) + self._fs_cmd("subvolume", "rm", self.volname, subvol2, group) + self._fs_cmd("subvolume", "rm", self.volname, subvol3, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() diff --git a/ceph/qa/tasks/check_counter.py b/ceph/qa/tasks/check_counter.py index daa81973b..40818f3f4 100644 --- a/ceph/qa/tasks/check_counter.py +++ b/ceph/qa/tasks/check_counter.py @@ -5,6 +5,8 @@ import json from teuthology.task import Task from teuthology import misc +from tasks import ceph_manager + log = logging.getLogger(__name__) @@ -30,8 +32,16 @@ class CheckCounter(Task): counters: mds: - "mds.dir_split" + - + name: "mds.dir_update" + min: 3 - workunit: ... """ + @property + def admin_remote(self): + first_mon = misc.get_first_mon(self.ctx, None) + (result,) = self.ctx.cluster.only(first_mon).remotes.keys() + return result def start(self): log.info("START") @@ -47,6 +57,10 @@ class CheckCounter(Task): if cluster_name is None: cluster_name = next(iter(self.ctx.managers.keys())) + + mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=self.ctx, logger=log.getChild('ceph_manager')) + active_mgr = json.loads(mon_manager.raw_cluster_cmd("mgr", "dump", "--format=json-pretty"))["active_name"] + for daemon_type, counters in targets.items(): # List of 'a', 'b', 'c'... 
daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type)) @@ -54,12 +68,15 @@ class CheckCounter(Task): self.ctx.daemons.get_daemon(daemon_type, daemon_id)) for daemon_id in daemon_ids]) + expected = set() seen = set() for daemon_id, daemon in daemons.items(): if not daemon.running(): log.info("Ignoring daemon {0}, it isn't running".format(daemon_id)) continue + elif daemon_type == 'mgr' and daemon_id != active_mgr: + continue else: log.debug("Getting stats from {0}".format(daemon_id)) @@ -72,23 +89,38 @@ class CheckCounter(Task): log.warning("No admin socket response from {0}, skipping".format(daemon_id)) continue + minval = '' + expected_val = '' for counter in counters: - subsys, counter_id = counter.split(".") - if subsys not in perf_dump or counter_id not in perf_dump[subsys]: - log.warning("Counter '{0}' not found on daemon {1}.{2}".format( - counter, daemon_type, daemon_id)) - continue - value = perf_dump[subsys][counter_id] + if isinstance(counter, dict): + name = counter['name'] + if 'min' in counter: + minval = counter['min'] + if 'expected_val' in counter: + expected_val = counter['expected_val'] + else: + name = counter + minval = 1 + expected.add(name) - log.info("Daemon {0}.{1} {2}={3}".format( - daemon_type, daemon_id, counter, value - )) + val = perf_dump + for key in name.split('.'): + if key not in val: + log.warning(f"Counter '{name}' not found on daemon {daemon_type}.{daemon_id}") + val = None + break - if value > 0: - seen.add(counter) + val = val[key] + + if val is not None: + log.info(f"Daemon {daemon_type}.{daemon_id} {name}={val}") + if isinstance(minval, int) and val >= minval: + seen.add(name) + elif isinstance(expected_val, int) and val == expected_val: + seen.add(name) if not dry_run: - unseen = set(counters) - set(seen) + unseen = set(expected) - set(seen) if unseen: raise RuntimeError("The following counters failed to be set " "on {0} daemons: {1}".format( diff --git a/ceph/qa/tasks/thrashosds-health.yaml b/ceph/qa/tasks/thrashosds-health.yaml index 1b2560d4e..2340944e8 100644 --- a/ceph/qa/tasks/thrashosds-health.yaml +++ b/ceph/qa/tasks/thrashosds-health.yaml @@ -16,3 +16,15 @@ overrides: - \(REQUEST_SLOW\) - \(TOO_FEW_PGS\) - slow request + - \(MON_DOWN\) + - osds down + - mons down + - flag\(s\) set + - out of quorum + - PG_ + - Reduced data availability + - stuck undersized + - backfill_toofull + - is down + - stuck peering + - acting diff --git a/ceph/qa/tasks/vstart_runner.py b/ceph/qa/tasks/vstart_runner.py index c88c93b80..5399e825d 100644 --- a/ceph/qa/tasks/vstart_runner.py +++ b/ceph/qa/tasks/vstart_runner.py @@ -63,7 +63,7 @@ try: except: pass -def init_log(): +def init_log(log_level=logging.INFO): global log if log is not None: del log @@ -78,7 +78,7 @@ def init_log(): datefmt='%Y-%m-%dT%H:%M:%S') handler.setFormatter(formatter) log.addHandler(handler) - log.setLevel(logging.INFO) + log.setLevel(log_level) log = None init_log() @@ -1332,19 +1332,15 @@ def teardown_cluster(): def clear_old_log(): - from os import stat - try: - stat(logpath) - # would need an update when making this py3 compatible. Use FileNotFound - # instead. 
- except OSError: + os.stat(logpath) + except FileNotFoundError: return else: os.remove(logpath) with open(logpath, 'w') as logfile: logfile.write('') - init_log() + init_log(log.level) log.debug('logging in a fresh file now...') diff --git a/ceph/qa/workunits/cephtool/test.sh b/ceph/qa/workunits/cephtool/test.sh index 2fa2d0d24..5c22c30e1 100755 --- a/ceph/qa/workunits/cephtool/test.sh +++ b/ceph/qa/workunits/cephtool/test.sh @@ -350,20 +350,21 @@ function test_tiering_1() ceph osd pool ls detail -f json | jq '.[] | select(.pool_name == "slow2") | .application_metadata["rados"]' | grep '{}' ceph osd pool ls detail -f json | jq '.[] | select(.pool_name == "cache") | .application_metadata["rados"]' | grep '{}' ceph osd pool ls detail -f json | jq '.[] | select(.pool_name == "cache2") | .application_metadata["rados"]' | grep '{}' - # forward and proxy are removed/deprecated + # forward is removed/deprecated expect_false ceph osd tier cache-mode cache forward expect_false ceph osd tier cache-mode cache forward --yes-i-really-mean-it - expect_false ceph osd tier cache-mode cache proxy - expect_false ceph osd tier cache-mode cache proxy --yes-i-really-mean-it # test some state transitions ceph osd tier cache-mode cache writeback expect_false ceph osd tier cache-mode cache readonly expect_false ceph osd tier cache-mode cache readonly --yes-i-really-mean-it + ceph osd tier cache-mode cache proxy ceph osd tier cache-mode cache readproxy ceph osd tier cache-mode cache none ceph osd tier cache-mode cache readonly --yes-i-really-mean-it ceph osd tier cache-mode cache none ceph osd tier cache-mode cache writeback + ceph osd tier cache-mode cache proxy + ceph osd tier cache-mode cache writeback expect_false ceph osd tier cache-mode cache none expect_false ceph osd tier cache-mode cache readonly --yes-i-really-mean-it # test with dirty objects in the tier pool @@ -371,7 +372,7 @@ function test_tiering_1() rados -p cache put /etc/passwd /etc/passwd flush_pg_stats # 1 dirty object in pool 'cache' - ceph osd tier cache-mode cache readproxy + ceph osd tier cache-mode cache proxy expect_false ceph osd tier cache-mode cache none expect_false ceph osd tier cache-mode cache readonly --yes-i-really-mean-it ceph osd tier cache-mode cache writeback @@ -380,7 +381,7 @@ function test_tiering_1() rados -p cache cache-flush-evict-all flush_pg_stats # no dirty objects in pool 'cache' - ceph osd tier cache-mode cache readproxy + ceph osd tier cache-mode cache proxy ceph osd tier cache-mode cache none ceph osd tier cache-mode cache readonly --yes-i-really-mean-it TRIES=0 @@ -1114,7 +1115,7 @@ function test_mon_mds() # Removing tier should be permitted because the underlying pool is # replicated (#11504 case) - ceph osd tier cache-mode mds-tier readproxy + ceph osd tier cache-mode mds-tier proxy ceph osd tier remove-overlay fs_metadata ceph osd tier remove fs_metadata mds-tier ceph osd pool delete mds-tier mds-tier --yes-i-really-really-mean-it diff --git a/ceph/qa/workunits/cls/test_cls_cmpomap.sh b/ceph/qa/workunits/cls/test_cls_cmpomap.sh index af079f6e6..6e3a4d830 100755 --- a/ceph/qa/workunits/cls/test_cls_cmpomap.sh +++ b/ceph/qa/workunits/cls/test_cls_cmpomap.sh @@ -1,5 +1,6 @@ #!/bin/sh -e -ceph_test_cls_cmpomap +# this test case changed in 16.2.6 so had been failing in pacific-p2p upgrades since +ceph_test_cls_cmpomap --gtest_filter=-CmpOmap.cmp_vals_u64_invalid_default exit 0 diff --git a/ceph/qa/workunits/kernel_untar_build.sh b/ceph/qa/workunits/kernel_untar_build.sh index 9b60f065c..602ce04a7 100755 --- 
a/ceph/qa/workunits/kernel_untar_build.sh +++ b/ceph/qa/workunits/kernel_untar_build.sh @@ -2,11 +2,11 @@ set -e -wget -O linux.tar.gz http://download.ceph.com/qa/linux-5.4.tar.gz +wget -O linux.tar.xz http://download.ceph.com/qa/linux-6.5.11.tar.xz mkdir t cd t -tar xzf ../linux.tar.gz +tar xJf ../linux.tar.xz cd linux* make defconfig make -j`grep -c processor /proc/cpuinfo` diff --git a/ceph/qa/workunits/mgr/test_per_module_finisher.sh b/ceph/qa/workunits/mgr/test_per_module_finisher.sh new file mode 100755 index 000000000..09937bc02 --- /dev/null +++ b/ceph/qa/workunits/mgr/test_per_module_finisher.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +set -ex + +# This testcase tests the per module finisher stats for enabled modules +# using check counter (qa/tasks/check_counter.py). + +# 'balancer' commands +ceph balancer pool ls + +# 'crash' commands +ceph crash ls +ceph crash ls-new + +# 'device' commands +ceph device query-daemon-health-metrics mon.a + +# 'iostat' command +ceph iostat & +pid=$! +sleep 3 +kill -SIGTERM $pid + +# 'pg_autoscaler' command +ceph osd pool autoscale-status + +# 'progress' command +ceph progress +ceph progress json + +# 'status' commands +ceph fs status +ceph osd status + +# 'telemetry' commands +ceph telemetry status + +echo OK diff --git a/ceph/qa/workunits/mon/rbd_snaps_ops.sh b/ceph/qa/workunits/mon/rbd_snaps_ops.sh index eb88565ea..0e5b16b7b 100755 --- a/ceph/qa/workunits/mon/rbd_snaps_ops.sh +++ b/ceph/qa/workunits/mon/rbd_snaps_ops.sh @@ -36,6 +36,7 @@ expect 'rbd --pool=test snap ls image' 0 expect 'rbd --pool=test snap rm image@snapshot' 0 expect 'ceph osd pool mksnap test snapshot' 22 +expect 'rados -p test mksnap snapshot' 1 expect 'ceph osd pool delete test test --yes-i-really-really-mean-it' 0 @@ -52,6 +53,8 @@ expect 'rbd --pool test-foo snap create image@snapshot' 0 ceph osd pool delete test-bar test-bar --yes-i-really-really-mean-it || true expect 'ceph osd pool create test-bar 8' 0 expect 'ceph osd pool application enable test-bar rbd' +# "rados cppool" without --yes-i-really-mean-it should fail +expect 'rados cppool test-foo test-bar' 1 expect 'rados cppool test-foo test-bar --yes-i-really-mean-it' 0 expect 'rbd --pool test-bar snap rm image@snapshot' 95 expect 'ceph osd pool delete test-foo test-foo --yes-i-really-really-mean-it' 0 diff --git a/ceph/qa/workunits/rbd/cli_generic.sh b/ceph/qa/workunits/rbd/cli_generic.sh index 3222a38c2..9b2648fd0 100755 --- a/ceph/qa/workunits/rbd/cli_generic.sh +++ b/ceph/qa/workunits/rbd/cli_generic.sh @@ -1258,7 +1258,6 @@ test_trash_purge_schedule_recovery() { jq 'select(.name == "rbd_support")' | jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add') ceph osd blocklist add $CLIENT_ADDR - ceph osd blocklist ls | grep $CLIENT_ADDR # Check that you can add a trash purge schedule after a few retries expect_fail rbd trash purge schedule add -p rbd3 10m @@ -1414,7 +1413,6 @@ test_mirror_snapshot_schedule_recovery() { jq 'select(.name == "rbd_support")' | jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add') ceph osd blocklist add $CLIENT_ADDR - ceph osd blocklist ls | grep $CLIENT_ADDR # Check that you can add a mirror snapshot schedule after a few retries expect_fail rbd mirror snapshot schedule add -p rbd3/ns1 --image test1 2m @@ -1523,7 +1521,6 @@ test_perf_image_iostat_recovery() { jq 'select(.name == "rbd_support")' | jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add') ceph osd blocklist add $CLIENT_ADDR - ceph osd blocklist ls | grep $CLIENT_ADDR expect_fail rbd perf image 
iostat --format json rbd3/ns sleep 10 @@ -1655,7 +1652,6 @@ test_tasks_recovery() { jq 'select(.name == "rbd_support")' | jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add') ceph osd blocklist add $CLIENT_ADDR - ceph osd blocklist ls | grep $CLIENT_ADDR expect_fail ceph rbd task add flatten rbd2/clone1 sleep 10 diff --git a/ceph/qa/workunits/rbd/krbd_watch_errors.sh b/ceph/qa/workunits/rbd/krbd_watch_errors.sh new file mode 100755 index 000000000..f650d2a74 --- /dev/null +++ b/ceph/qa/workunits/rbd/krbd_watch_errors.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +set -ex +set -o pipefail + +function refresh_loop() { + local dev_id="$1" + + set +x + + local i + for ((i = 1; ; i++)); do + echo 1 | sudo tee "${SYSFS_DIR}/${dev_id}/refresh" > /dev/null + if ((i % 100 == 0)); then + echo "Refreshed ${i} times" + fi + done +} + +readonly SYSFS_DIR="/sys/bus/rbd/devices" +readonly IMAGE_NAME="watch-errors-test" + +rbd create -s 1G --image-feature exclusive-lock "${IMAGE_NAME}" + +# induce a watch error every 30 seconds +dev="$(sudo rbd device map -o osdkeepalive=60 "${IMAGE_NAME}")" +dev_id="${dev#/dev/rbd}" + +# constantly refresh, not just on watch errors +refresh_loop "${dev_id}" & +refresh_pid=$! + +sudo dmesg -C + +# test that none of the above triggers a deadlock with a workload +fio --name test --filename="${dev}" --ioengine=libaio --direct=1 \ + --rw=randwrite --norandommap --randrepeat=0 --bs=512 --iodepth=128 \ + --time_based --runtime=1h --eta=never + +num_errors="$(dmesg | grep -c "rbd${dev_id}: encountered watch error")" +echo "Recorded ${num_errors} watch errors" + +kill "${refresh_pid}" +wait + +sudo rbd device unmap "${dev}" + +if ((num_errors < 60)); then + echo "Too few watch errors" + exit 1 +fi + +echo OK diff --git a/ceph/qa/workunits/rbd/rbd-nbd.sh b/ceph/qa/workunits/rbd/rbd-nbd.sh index 85f383713..319b28ef3 100755 --- a/ceph/qa/workunits/rbd/rbd-nbd.sh +++ b/ceph/qa/workunits/rbd/rbd-nbd.sh @@ -417,6 +417,16 @@ DEV= rbd feature disable ${POOL}/${IMAGE} journaling rbd config image rm ${POOL}/${IMAGE} rbd_discard_granularity_bytes +# test that disabling a feature so that the op is proxied to rbd-nbd +# (arranged here by blkdiscard before "rbd feature disable") doesn't hang +DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}` +get_pid ${POOL} +rbd feature enable ${POOL}/${IMAGE} journaling +_sudo blkdiscard --offset 0 --length 4096 ${DEV} +rbd feature disable ${POOL}/${IMAGE} journaling +unmap_device ${DEV} ${PID} +DEV= + # test that rbd_op_threads setting takes effect EXPECTED=`ceph-conf --show-config-value librados_thread_count` DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}` diff --git a/ceph/qa/workunits/rbd/rbd_mirror_helpers.sh b/ceph/qa/workunits/rbd/rbd_mirror_helpers.sh index ca715d854..f4961b925 100755 --- a/ceph/qa/workunits/rbd/rbd_mirror_helpers.sh +++ b/ceph/qa/workunits/rbd/rbd_mirror_helpers.sh @@ -1169,6 +1169,16 @@ wait_for_snap_removed_from_trash() return 1 } +count_mirror_snaps() +{ + local cluster=$1 + local pool=$2 + local image=$3 + + rbd --cluster ${cluster} snap ls ${pool}/${image} --all | + grep -c -F " mirror (" +} + write_image() { local cluster=$1 diff --git a/ceph/qa/workunits/rbd/rbd_mirror_journal.sh b/ceph/qa/workunits/rbd/rbd_mirror_journal.sh index 56a8b13a9..54f6aeec8 100755 --- a/ceph/qa/workunits/rbd/rbd_mirror_journal.sh +++ b/ceph/qa/workunits/rbd/rbd_mirror_journal.sh @@ -214,7 +214,29 @@ wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' 'primary wait_for_status_in_pool_dir 
${CLUSTER2} ${POOL} ${image} 'up+stopped' compare_images ${POOL} ${image} -# force promote +testlog "TEST: failover / failback loop" +for i in `seq 1 20`; do + demote_image ${CLUSTER2} ${POOL} ${image} + wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image} + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' + promote_image ${CLUSTER1} ${POOL} ${image} + wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${image} + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped' + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+replaying' + demote_image ${CLUSTER1} ${POOL} ${image} + wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${image} + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' + promote_image ${CLUSTER2} ${POOL} ${image} + wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' +done + +testlog "TEST: force promote" force_promote_image=test_force_promote create_image ${CLUSTER2} ${POOL} ${force_promote_image} write_image ${CLUSTER2} ${POOL} ${force_promote_image} 100 diff --git a/ceph/qa/workunits/rbd/rbd_mirror_snapshot.sh b/ceph/qa/workunits/rbd/rbd_mirror_snapshot.sh index 0060440fb..c70d48b09 100755 --- a/ceph/qa/workunits/rbd/rbd_mirror_snapshot.sh +++ b/ceph/qa/workunits/rbd/rbd_mirror_snapshot.sh @@ -220,7 +220,32 @@ wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' compare_images ${POOL} ${image} -# force promote +testlog "TEST: failover / failback loop" +for i in `seq 1 20`; do + demote_image ${CLUSTER2} ${POOL} ${image} + wait_for_image_replay_stopped ${CLUSTER1} ${POOL} ${image} + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' + promote_image ${CLUSTER1} ${POOL} ${image} + wait_for_image_replay_started ${CLUSTER2} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${POOL} ${image} + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+stopped' + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+replaying' + demote_image ${CLUSTER1} ${POOL} ${image} + wait_for_image_replay_stopped ${CLUSTER2} ${POOL} ${image} + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+unknown' + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+unknown' + promote_image ${CLUSTER2} ${POOL} ${image} + wait_for_image_replay_started ${CLUSTER1} ${POOL} ${image} + wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${POOL} ${image} + wait_for_status_in_pool_dir ${CLUSTER2} ${POOL} ${image} 'up+stopped' + wait_for_status_in_pool_dir ${CLUSTER1} ${POOL} ${image} 'up+replaying' +done +# check that demote (or other mirror snapshots) don't pile up +test "$(count_mirror_snaps ${CLUSTER1} ${POOL} ${image})" -le 3 +test "$(count_mirror_snaps ${CLUSTER2} ${POOL} ${image})" -le 3 + +testlog "TEST: force promote" force_promote_image=test_force_promote create_image_and_enable_mirror ${CLUSTER2} ${POOL} ${force_promote_image} write_image ${CLUSTER2} ${POOL} ${force_promote_image} 100 
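Editorial note on the hunk above: the failover/failback loop added to both rbd_mirror_journal.sh and rbd_mirror_snapshot.sh demotes and promotes the image on alternating clusters twenty times, and the snapshot-based variant then uses the new count_mirror_snaps() helper to assert that no more than three mirror snapshots remain on either cluster, i.e. that demote/promote cycles do not leak snapshots. For readers outside the workunit harness, the following is a minimal Python sketch of the same count; it assumes that `rbd snap ls --all --format json` reports mirror snapshots with a namespace whose type is "mirror", and the cluster/pool/image names are placeholders.

    import json
    import subprocess

    def count_mirror_snaps(cluster: str, pool: str, image: str) -> int:
        # List every snapshot (including mirror-namespace ones) as JSON and
        # count the entries whose namespace type is "mirror".
        out = subprocess.check_output(
            ['rbd', '--cluster', cluster, 'snap', 'ls',
             f'{pool}/{image}', '--all', '--format', 'json'])
        snaps = json.loads(out)
        return sum(
            1 for snap in snaps
            if isinstance(snap.get('namespace'), dict)
            and snap['namespace'].get('type') == 'mirror')

    # Equivalent of the workunit assertions after the 20 failover/failback rounds:
    # assert count_mirror_snaps('cluster1', 'rbd', 'image') <= 3
    # assert count_mirror_snaps('cluster2', 'rbd', 'image') <= 3

The bash helper above greps the plain-text listing for " mirror (" instead; parsing the JSON output is simply a more structured way of expressing the same count.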
diff --git a/ceph/qa/workunits/rbd/rbd_support_module_recovery.sh b/ceph/qa/workunits/rbd/rbd_support_module_recovery.sh new file mode 100755 index 000000000..e9defced2 --- /dev/null +++ b/ceph/qa/workunits/rbd/rbd_support_module_recovery.sh @@ -0,0 +1,77 @@ +#!/bin/bash +set -ex + +POOL=rbd +IMAGE_PREFIX=image +NUM_IMAGES=20 +RUN_TIME=3600 + +rbd mirror pool enable ${POOL} image +rbd mirror pool peer add ${POOL} dummy + +# Create images and schedule their mirror snapshots +for ((i = 1; i <= ${NUM_IMAGES}; i++)); do + rbd create -s 1G --image-feature exclusive-lock ${POOL}/${IMAGE_PREFIX}$i + rbd mirror image enable ${POOL}/${IMAGE_PREFIX}$i snapshot + rbd mirror snapshot schedule add -p ${POOL} --image ${IMAGE_PREFIX}$i 1m +done + +# Run fio workloads on images via kclient +# Test the recovery of the rbd_support module and its scheduler from their +# librbd client being blocklisted while a exclusive lock gets passed around +# between their librbd client and a kclient trying to take mirror snapshots +# and perform I/O on the same image. +for ((i = 1; i <= ${NUM_IMAGES}; i++)); do + DEVS[$i]=$(sudo rbd device map ${POOL}/${IMAGE_PREFIX}$i) + fio --name=fiotest --filename=${DEVS[$i]} --rw=randrw --bs=4K --direct=1 \ + --ioengine=libaio --iodepth=2 --runtime=43200 --time_based \ + &> /dev/null & +done + +# Repeatedly blocklist rbd_support module's client ~10s after the module +# recovers from previous blocklisting +CURRENT_TIME=$(date +%s) +END_TIME=$((CURRENT_TIME + RUN_TIME)) +PREV_CLIENT_ADDR="" +CLIENT_ADDR="" +while ((CURRENT_TIME <= END_TIME)); do + if [[ -n "${CLIENT_ADDR}" ]] && + [[ "${CLIENT_ADDR}" != "${PREV_CLIENT_ADDR}" ]]; then + ceph osd blocklist add ${CLIENT_ADDR} + # Confirm rbd_support module's client is blocklisted + ceph osd blocklist ls | grep -q ${CLIENT_ADDR} + PREV_CLIENT_ADDR=${CLIENT_ADDR} + fi + sleep 10 + CLIENT_ADDR=$(ceph mgr dump | + jq .active_clients[] | + jq 'select(.name == "rbd_support")' | + jq -r '[.addrvec[0].addr, "/", .addrvec[0].nonce|tostring] | add') + CURRENT_TIME=$(date +%s) +done + +# Confirm that rbd_support module recovered from repeated blocklisting +# Check that you can add a mirror snapshot schedule after a few retries +for ((i = 1; i <= 24; i++)); do + rbd mirror snapshot schedule add -p ${POOL} \ + --image ${IMAGE_PREFIX}1 2m && break + sleep 10 +done +rbd mirror snapshot schedule ls -p ${POOL} --image ${IMAGE_PREFIX}1 | + grep 'every 2m' +# Verify that the schedule present before client blocklisting is preserved +rbd mirror snapshot schedule ls -p ${POOL} --image ${IMAGE_PREFIX}1 | + grep 'every 1m' +rbd mirror snapshot schedule rm -p ${POOL} --image ${IMAGE_PREFIX}1 2m +for ((i = 1; i <= ${NUM_IMAGES}; i++)); do + rbd mirror snapshot schedule rm -p ${POOL} --image ${IMAGE_PREFIX}$i 1m +done + +# cleanup +killall fio || true +wait +for ((i = 1; i <= ${NUM_IMAGES}; i++)); do + sudo rbd device unmap ${DEVS[$i]} +done + +echo OK diff --git a/ceph/qa/workunits/rgw/common.py b/ceph/qa/workunits/rgw/common.py index 235c36c95..2c9c5d035 100755 --- a/ceph/qa/workunits/rgw/common.py +++ b/ceph/qa/workunits/rgw/common.py @@ -5,6 +5,9 @@ import subprocess import logging as log import boto3 import botocore.exceptions +import random +import json +from time import sleep log.basicConfig(format = '%(message)s', level=log.DEBUG) log.getLogger('botocore').setLevel(log.CRITICAL) @@ -55,3 +58,46 @@ def boto_connect(access_key, secret_key, config=None): except botocore.exceptions.ConnectionError: # retry with ssl return try_connect('443', True, 'https') + 
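The hunk just below adds two helpers to qa/workunits/rgw/common.py: put_objects(), which records the (key, version_id) pairs it creates, and create_unlinked_objects(), which brackets its PUTs with the rgw_debug_inject_set_olh_err / rgw_debug_inject_olh_cancel_modification_err injection options and clears them again in a finally block so that each failed PUT leaves an unlinked, unlistable index entry behind. As a structural aside, the same set/clear bracket can be written as a context manager; the sketch below is only an illustration of that alternative, reusing exec_cmd() from this module and the exact option names from the patch, not code that exists in the tree.

    from contextlib import contextmanager
    from time import sleep

    from common import exec_cmd  # the subprocess helper defined in this module

    @contextmanager
    def olh_error_injection():
        # Make OLH updates fail so each PUT leaves an unlinked/unlistable
        # index entry behind, and always undo the injection afterwards.
        exec_cmd('ceph config set client rgw_debug_inject_set_olh_err 2')
        exec_cmd('ceph config set client rgw_debug_inject_olh_cancel_modification_err true')
        sleep(1)  # give radosgw a moment to pick up the new options
        try:
            yield
        finally:
            exec_cmd('ceph config rm client rgw_debug_inject_set_olh_err')
            exec_cmd('ceph config rm client rgw_debug_inject_olh_cancel_modification_err')

    # with olh_error_injection():
    #     ... issue the PUTs that are expected to fail partway ...

The helper added below keeps an explicit try/finally instead, which reads more naturally next to its per-key `radosgw-admin bi list` verification loop.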
+def put_objects(bucket, key_list): + objs = [] + for key in key_list: + o = bucket.put_object(Key=key, Body=b"some_data") + objs.append((o.key, o.version_id)) + return objs + +def create_unlinked_objects(conn, bucket, key_list): + # creates an unlinked/unlistable object for each key in key_list + + object_versions = [] + try: + exec_cmd('ceph config set client rgw_debug_inject_set_olh_err 2') + exec_cmd('ceph config set client rgw_debug_inject_olh_cancel_modification_err true') + sleep(1) + for key in key_list: + tag = str(random.randint(0, 1_000_000)) + try: + bucket.put_object(Key=key, Body=b"some_data", Metadata = { + 'tag': tag, + }) + except Exception as e: + log.debug(e) + out = exec_cmd(f'radosgw-admin bi list --bucket {bucket.name} --object {key}') + instance_entries = filter( + lambda x: x['type'] == 'instance', + json.loads(out.replace(b'\x80', b'0x80'))) + found = False + for ie in instance_entries: + instance_id = ie['entry']['instance'] + ov = conn.ObjectVersion(bucket.name, key, instance_id).head() + if ov['Metadata'] and ov['Metadata']['tag'] == tag: + object_versions.append((key, instance_id)) + found = True + break + if not found: + raise Exception(f'failed to create unlinked object for key={key}') + finally: + exec_cmd('ceph config rm client rgw_debug_inject_set_olh_err') + exec_cmd('ceph config rm client rgw_debug_inject_olh_cancel_modification_err') + return object_versions + diff --git a/ceph/qa/workunits/rgw/run-bucket-check.sh b/ceph/qa/workunits/rgw/run-bucket-check.sh new file mode 100755 index 000000000..85e02db5e --- /dev/null +++ b/ceph/qa/workunits/rgw/run-bucket-check.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +set -ex + +# assume working ceph environment (radosgw-admin in path) and rgw on localhost:80 +# localhost::443 for ssl + +mydir=`dirname $0` + +python3 -m venv $mydir +source $mydir/bin/activate +pip install pip --upgrade +pip install boto3 + +## run test +$mydir/bin/python3 $mydir/test_rgw_bucket_check.py + +deactivate +echo OK. + diff --git a/ceph/qa/workunits/rgw/test_rgw_bucket_check.py b/ceph/qa/workunits/rgw/test_rgw_bucket_check.py new file mode 100755 index 000000000..bfa6d65d6 --- /dev/null +++ b/ceph/qa/workunits/rgw/test_rgw_bucket_check.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 + +import logging as log +import json +import botocore +from common import exec_cmd, create_user, boto_connect, put_objects, create_unlinked_objects +from botocore.config import Config + +""" +Tests behavior of radosgw-admin bucket check commands. +""" +# The test cases in this file have been annotated for inventory. 
+# To extract the inventory (in csv format) use the command: +# +# grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //' +# +# + +""" Constants """ +USER = 'check-tester' +DISPLAY_NAME = 'Check Testing' +ACCESS_KEY = 'OJODXSLNX4LUNHQG99PA' +SECRET_KEY = '3l6ffld34qaymfomuh832j94738aie2x4p2o8h6n' +BUCKET_NAME = 'check-bucket' + +def main(): + """ + execute bucket check commands + """ + create_user(USER, DISPLAY_NAME, ACCESS_KEY, SECRET_KEY) + + connection = boto_connect(ACCESS_KEY, SECRET_KEY, Config(retries = { + 'total_max_attempts': 1, + })) + + # pre-test cleanup + try: + bucket = connection.Bucket(BUCKET_NAME) + bucket.objects.all().delete() + bucket.object_versions.all().delete() + bucket.delete() + except botocore.exceptions.ClientError as e: + if not e.response['Error']['Code'] == 'NoSuchBucket': + raise + + bucket = connection.create_bucket(Bucket=BUCKET_NAME) + + null_version_keys = ['a', 'z'] + null_version_objs = put_objects(bucket, null_version_keys) + + connection.BucketVersioning(BUCKET_NAME).enable() + + ok_keys = ['a', 'b', 'c', 'd'] + unlinked_keys = ['c', 'd', 'e', 'f'] + ok_objs = put_objects(bucket, ok_keys) + + # TESTCASE 'recalculated bucket check stats are correct' + log.debug('TEST: recalculated bucket check stats are correct\n') + exec_cmd(f'radosgw-admin bucket check --fix --bucket {BUCKET_NAME}') + out = exec_cmd(f'radosgw-admin bucket stats --bucket {BUCKET_NAME}') + json_out = json.loads(out) + log.debug(json_out['usage']) + assert json_out['usage']['rgw.main']['num_objects'] == 6 + + # TESTCASE 'bucket check unlinked does not report normal entries' + log.debug('TEST: bucket check unlinked does not report normal entries\n') + out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {BUCKET_NAME} --min-age-hours 0 --dump-keys') + json_out = json.loads(out) + assert len(json_out) == 0 + + unlinked_objs = create_unlinked_objects(connection, bucket, unlinked_keys) + + # TESTCASE 'bucket check unlinked finds unlistable entries' + log.debug('TEST: bucket check unlinked finds unlistable entries\n') + out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {BUCKET_NAME} --min-age-hours 0 --dump-keys') + json_out = json.loads(out) + assert len(json_out) == len(unlinked_keys) + + # TESTCASE 'unlinked entries are not listable' + log.debug('TEST: unlinked entries are not listable\n') + for ov in bucket.object_versions.all(): + assert (ov.key, ov.version_id) not in unlinked_objs, f'object "{ov.key}:{ov.version_id}" was found in bucket listing' + + # TESTCASE 'GET returns 404 for unlinked entry keys that have no other versions' + log.debug('TEST: GET returns 404 for unlinked entry keys that have no other versions\n') + noent_keys = set(unlinked_keys) - set(ok_keys) + for key in noent_keys: + try: + bucket.Object(key).get() + assert False, 'GET did not return 404 for key={key} with no prior successful PUT' + except botocore.exceptions.ClientError as e: + assert e.response['ResponseMetadata']['HTTPStatusCode'] == 404 + + # TESTCASE 'bucket check unlinked fixes unlistable entries' + log.debug('TEST: bucket check unlinked fixes unlistable entries\n') + out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {BUCKET_NAME} --fix --min-age-hours 0 --rgw-olh-pending-timeout-sec 0 --dump-keys') + json_out = json.loads(out) + assert len(json_out) == len(unlinked_keys) + for o in unlinked_objs: + try: + connection.ObjectVersion(bucket.name, o[0], o[1]).head() + assert False, f'head for unlistable object {o[0]}:{o[1]} succeeded after fix' + except 
botocore.exceptions.ClientError as e: + assert e.response['ResponseMetadata']['HTTPStatusCode'] == 404 + + # TESTCASE 'bucket check unlinked fix does not affect normal entries' + log.debug('TEST: bucket check unlinked does not affect normal entries\n') + all_listable = list(bucket.object_versions.all()) + assert len(all_listable) == len(ok_keys) + len(null_version_keys), 'some normal objects were not accounted for in object listing after unlinked fix' + for o in ok_objs: + assert o in map(lambda x: (x.key, x.version_id), all_listable), "normal object not listable after fix" + connection.ObjectVersion(bucket.name, o[0], o[1]).head() + + # TESTCASE 'bucket check unlinked does not find new unlistable entries after fix' + log.debug('TEST: bucket check unlinked does not find new unlistable entries after fix\n') + out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {BUCKET_NAME} --min-age-hours 0 --dump-keys') + json_out = json.loads(out) + assert len(json_out) == 0 + + # for this set of keys we can produce leftover OLH object/entries by + # deleting the normal object instance since we should already have a leftover + # pending xattr on the OLH object due to the errors associated with the + # prior unlinked entries that were created for the same keys + leftover_pending_xattr_keys = set(ok_keys).intersection(unlinked_keys) + objs_to_delete = filter(lambda x: x[0] in leftover_pending_xattr_keys, ok_objs) + + for o in objs_to_delete: + connection.ObjectVersion(bucket.name, o[0], o[1]).delete() + + for key in leftover_pending_xattr_keys: + out = exec_cmd(f'radosgw-admin bi list --bucket {BUCKET_NAME} --object {key}') + idx_entries = json.loads(out.replace(b'\x80', b'0x80')) + assert len(idx_entries) > 0, 'failed to create leftover OLH entries for key {key}' + + # TESTCASE 'bucket check olh finds leftover OLH entries' + log.debug('TEST: bucket check olh finds leftover OLH entries\n') + out = exec_cmd(f'radosgw-admin bucket check olh --bucket {BUCKET_NAME} --dump-keys') + json_out = json.loads(out) + assert len(json_out) == len(leftover_pending_xattr_keys) + + # TESTCASE 'bucket check olh fixes leftover OLH entries' + log.debug('TEST: bucket check olh fixes leftover OLH entries\n') + out = exec_cmd(f'radosgw-admin bucket check olh --bucket {BUCKET_NAME} --fix --rgw-olh-pending-timeout-sec 0 --dump-keys') + json_out = json.loads(out) + assert len(json_out) == len(leftover_pending_xattr_keys) + + for key in leftover_pending_xattr_keys: + out = exec_cmd(f'radosgw-admin bi list --bucket {BUCKET_NAME} --object {key}') + idx_entries = json.loads(out.replace(b'\x80', b'0x80')) + assert len(idx_entries) == 0, 'index entries still exist for key={key} after olh fix' + + # TESTCASE 'bucket check olh does not find new leftover OLH entries after fix' + log.debug('TEST: bucket check olh does not find new leftover OLH entries after fix\n') + out = exec_cmd(f'radosgw-admin bucket check olh --bucket {BUCKET_NAME} --dump-keys') + json_out = json.loads(out) + assert len(json_out) == 0 + + # TESTCASE 'bucket check fixes do not affect null version objects' + log.debug('TEST: verify that bucket check fixes do not affect null version objects\n') + for o in null_version_objs: + connection.ObjectVersion(bucket.name, o[0], 'null').head() + + all_versions = list(map(lambda x: (x.key, x.version_id), bucket.object_versions.all())) + for key in null_version_keys: + assert (key, 'null') in all_versions + + # TESTCASE 'bucket check stats are correct in the presence of unlinked entries' + log.debug('TEST: bucket check stats 
are correct in the presence of unlinked entries\n') + bucket.object_versions.all().delete() + null_version_objs = put_objects(bucket, null_version_keys) + ok_objs = put_objects(bucket, ok_keys) + unlinked_objs = create_unlinked_objects(connection, bucket, unlinked_keys) + exec_cmd(f'radosgw-admin bucket check --fix --bucket {BUCKET_NAME}') + out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {BUCKET_NAME} --fix --min-age-hours 0 --rgw-olh-pending-timeout-sec 0 --dump-keys') + json_out = json.loads(out) + assert len(json_out) == len(unlinked_keys) + bucket.object_versions.all().delete() + out = exec_cmd(f'radosgw-admin bucket stats --bucket {BUCKET_NAME}') + json_out = json.loads(out) + log.debug(json_out['usage']) + assert json_out['usage']['rgw.main']['size'] == 0 + assert json_out['usage']['rgw.main']['num_objects'] == 0 + assert json_out['usage']['rgw.main']['size_actual'] == 0 + assert json_out['usage']['rgw.main']['size_kb'] == 0 + assert json_out['usage']['rgw.main']['size_kb_actual'] == 0 + assert json_out['usage']['rgw.main']['size_kb_utilized'] == 0 + + # Clean up + log.debug("Deleting bucket {}".format(BUCKET_NAME)) + bucket.object_versions.all().delete() + bucket.delete() + +main() +log.info("Completed bucket check tests") diff --git a/ceph/qa/workunits/rgw/test_rgw_reshard.py b/ceph/qa/workunits/rgw/test_rgw_reshard.py index 54fddb469..eed6fc75e 100755 --- a/ceph/qa/workunits/rgw/test_rgw_reshard.py +++ b/ceph/qa/workunits/rgw/test_rgw_reshard.py @@ -4,7 +4,7 @@ import time import logging as log import json import os -from common import exec_cmd, boto_connect, create_user +from common import exec_cmd, boto_connect, create_user, put_objects, create_unlinked_objects """ Rgw manual and dynamic resharding testing against a running instance @@ -75,14 +75,14 @@ def main(): execute manual and dynamic resharding commands """ create_user(USER, DISPLAY_NAME, ACCESS_KEY, SECRET_KEY) - + connection = boto_connect(ACCESS_KEY, SECRET_KEY) # create a bucket bucket1 = connection.create_bucket(Bucket=BUCKET_NAME1) bucket2 = connection.create_bucket(Bucket=BUCKET_NAME2) ver_bucket = connection.create_bucket(Bucket=VER_BUCKET_NAME) - connection.BucketVersioning('ver_bucket') + connection.BucketVersioning(VER_BUCKET_NAME).enable() bucket_stats1 = get_bucket_stats(BUCKET_NAME1) bucket_stats2 = get_bucket_stats(BUCKET_NAME2) @@ -199,6 +199,28 @@ def main(): json_op = json.loads(cmd) assert len(json_op) == 0 + # TESTCASE 'check that bucket stats are correct after reshard with unlinked entries' + log.debug('TEST: check that bucket stats are correct after reshard with unlinked entries\n') + ver_bucket.object_versions.all().delete() + ok_keys = ['a', 'b', 'c'] + unlinked_keys = ['x', 'y', 'z'] + put_objects(ver_bucket, ok_keys) + create_unlinked_objects(connection, ver_bucket, unlinked_keys) + cmd = exec_cmd(f'radosgw-admin bucket reshard --bucket {VER_BUCKET_NAME} --num-shards 17 --yes-i-really-mean-it') + out = exec_cmd(f'radosgw-admin bucket check unlinked --bucket {VER_BUCKET_NAME} --fix --min-age-hours 0 --rgw-olh-pending-timeout-sec 0 --dump-keys') + json_out = json.loads(out) + assert len(json_out) == len(unlinked_keys) + ver_bucket.object_versions.all().delete() + out = exec_cmd(f'radosgw-admin bucket stats --bucket {VER_BUCKET_NAME}') + json_out = json.loads(out) + log.debug(json_out['usage']) + assert json_out['usage']['rgw.main']['size'] == 0 + assert json_out['usage']['rgw.main']['num_objects'] == 0 + assert json_out['usage']['rgw.main']['size_actual'] == 0 + assert 
json_out['usage']['rgw.main']['size_kb'] == 0 + assert json_out['usage']['rgw.main']['size_kb_actual'] == 0 + assert json_out['usage']['rgw.main']['size_kb_utilized'] == 0 + # Clean up log.debug("Deleting bucket %s", BUCKET_NAME1) bucket1.objects.all().delete() diff --git a/ceph/src/.git_version b/ceph/src/.git_version index 4a2b845de..cb2c609ad 100644 --- a/ceph/src/.git_version +++ b/ceph/src/.git_version @@ -1,2 +1,2 @@ -238ba602515df21ea7ffc75c88db29f9e5ef12c9 -16.2.14 +618f440892089921c3e944a991122ddc44e60516 +16.2.15 diff --git a/ceph/src/CMakeLists.txt b/ceph/src/CMakeLists.txt index a9aef68ed..6f5dd11f7 100644 --- a/ceph/src/CMakeLists.txt +++ b/ceph/src/CMakeLists.txt @@ -601,7 +601,9 @@ if(WITH_LIBRADOSSTRIPER) add_subdirectory(libradosstriper) endif() -add_subdirectory(mgr) +if(WITH_MGR) + add_subdirectory(mgr) +endif() set(librados_config_srcs librados-config.cc) diff --git a/ceph/src/blk/kernel/KernelDevice.cc b/ceph/src/blk/kernel/KernelDevice.cc index fb9d8e449..55e19442d 100644 --- a/ceph/src/blk/kernel/KernelDevice.cc +++ b/ceph/src/blk/kernel/KernelDevice.cc @@ -126,8 +126,25 @@ int KernelDevice::open(const string& p) int r = 0, i = 0; dout(1) << __func__ << " path " << path << dendl; + struct stat statbuf; + bool is_block; + r = stat(path.c_str(), &statbuf); + if (r != 0) { + derr << __func__ << " stat got: " << cpp_strerror(r) << dendl; + goto out_fail; + } + is_block = (statbuf.st_mode & S_IFMT) == S_IFBLK; for (i = 0; i < WRITE_LIFE_MAX; i++) { - int fd = ::open(path.c_str(), O_RDWR | O_DIRECT); + int flags = 0; + if (lock_exclusive && is_block && (i == 0)) { + // If opening block device use O_EXCL flag. It gives us best protection, + // as no other process can overwrite the data for as long as we are running. + // For block devices ::flock is not enough, + // since 2 different inodes with same major/minor can be locked. + // Exclusion by O_EXCL works in containers too. + flags |= O_EXCL; + } + int fd = ::open(path.c_str(), O_RDWR | O_DIRECT | flags); if (fd < 0) { r = -errno; break; @@ -180,6 +197,10 @@ int KernelDevice::open(const string& p) } if (lock_exclusive) { + // We need to keep soft locking (via flock()) because O_EXCL does not work for regular files. + // This is as good as we can get. Other processes can still overwrite the data, + // but at least we are protected from mounting same device twice in ceph processes. + // We also apply soft locking for block devices, as it populates /proc/locks. 
(see lslocks) r = _lock(); if (r < 0) { derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/batch.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/batch.py index 600450e40..d867fe2d8 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/batch.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/batch.py @@ -53,7 +53,7 @@ def get_physical_osds(devices, args): data_slots = args.osds_per_device if args.data_slots: data_slots = max(args.data_slots, args.osds_per_device) - rel_data_size = 1.0 / data_slots + rel_data_size = args.data_allocate_fraction / data_slots mlogger.debug('relative data size: {}'.format(rel_data_size)) ret = [] for dev in devices: @@ -297,6 +297,12 @@ class Batch(object): ' if more slots then osds-per-device are specified, slots' 'will stay unoccupied'), ) + parser.add_argument( + '--data-allocate-fraction', + type=arg_validators.ValidFraction(), + help='Fraction to allocate from data device (0,1.0]', + default=1.0 + ) parser.add_argument( '--block-db-size', type=disk.Size.parse, diff --git a/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py b/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py index 94751f23f..08ca4315e 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/lvm/zap.py @@ -202,8 +202,11 @@ class Zap(object): """ if device.is_encrypted: # find the holder - holders = [ - '/dev/%s' % holder for holder in device.sys_api.get('holders', []) + pname = device.sys_api.get('parent') + devname = device.sys_api.get('devname') + parent_device = Device(f'/dev/{pname}') + holders: List[str] = [ + f'/dev/{holder}' for holder in parent_device.sys_api['partitions'][devname]['holders'] ] for mapper_uuid in os.listdir('/dev/mapper'): mapper_path = os.path.join('/dev/mapper', mapper_uuid) diff --git a/ceph/src/ceph-volume/ceph_volume/devices/raw/common.py b/ceph/src/ceph-volume/ceph_volume/devices/raw/common.py index 19de81fe5..89ee285be 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/raw/common.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/raw/common.py @@ -49,4 +49,10 @@ def create_parser(prog, description): action='store_true', help='Enable device encryption via dm-crypt', ) + parser.add_argument( + '--osd-id', + help='Reuse an existing OSD id', + default=None, + type=arg_validators.valid_osd_id, + ) return parser diff --git a/ceph/src/ceph-volume/ceph_volume/devices/raw/list.py b/ceph/src/ceph-volume/ceph_volume/devices/raw/list.py index a9eb41312..0f801701b 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/raw/list.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/raw/list.py @@ -88,18 +88,21 @@ class List(object): # parent isn't bluestore, then the child could be a valid bluestore OSD. If we fail to # determine whether a parent is bluestore, we should err on the side of not reporting # the child so as not to give a false negative. - for info_device in info_devices: - if 'PKNAME' in info_device and info_device['PKNAME'] != "": - parent = info_device['PKNAME'] - try: - if disk.has_bluestore_label(parent): - logger.warning(('ignoring child device {} whose parent {} is a BlueStore OSD.'.format(dev, parent), - 'device is likely a phantom Atari partition. device info: {}'.format(info_device))) - continue - except OSError as e: - logger.error(('ignoring child device {} to avoid reporting invalid BlueStore data from phantom Atari partitions.'.format(dev), - 'failed to determine if parent device {} is BlueStore. 
err: {}'.format(parent, e))) + info_device = [info for info in info_devices if info['NAME'] == dev][0] + if info_device['TYPE'] == 'lvm': + # lvm devices are not raw devices + continue + if 'PKNAME' in info_device and info_device['PKNAME'] != "": + parent = info_device['PKNAME'] + try: + if disk.has_bluestore_label(parent): + logger.warning(('ignoring child device {} whose parent {} is a BlueStore OSD.'.format(dev, parent), + 'device is likely a phantom Atari partition. device info: {}'.format(info_device))) continue + except OSError as e: + logger.error(('ignoring child device {} to avoid reporting invalid BlueStore data from phantom Atari partitions.'.format(dev), + 'failed to determine if parent device {} is BlueStore. err: {}'.format(parent, e))) + continue bs_info = _get_bluestore_info(dev) if bs_info is None: diff --git a/ceph/src/ceph-volume/ceph_volume/devices/raw/prepare.py b/ceph/src/ceph-volume/ceph_volume/devices/raw/prepare.py index 2179db233..aabe82473 100644 --- a/ceph/src/ceph-volume/ceph_volume/devices/raw/prepare.py +++ b/ceph/src/ceph-volume/ceph_volume/devices/raw/prepare.py @@ -111,7 +111,9 @@ class Prepare(object): # reuse a given ID if it exists, otherwise create a new ID self.osd_id = prepare_utils.create_id( - osd_fsid, json.dumps(secrets)) + osd_fsid, + json.dumps(secrets), + osd_id=self.args.osd_id) prepare_bluestore( self.args.data, diff --git a/ceph/src/ceph-volume/ceph_volume/tests/conftest.py b/ceph/src/ceph-volume/ceph_volume/tests/conftest.py index 8ec8ca131..75628f847 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/conftest.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/conftest.py @@ -319,3 +319,7 @@ def fake_filesystem(fs): fs.create_dir('/sys/block/sda/queue') fs.create_dir('/sys/block/rbd0') yield fs + +@pytest.fixture(params=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.999, 1.0]) +def data_allocate_fraction(request): + return request.param diff --git a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py index 8a0f8df8c..d27134d53 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/devices/lvm/test_batch.py @@ -57,6 +57,7 @@ class TestBatch(object): bluestore=True, block_db_size="1G", dmcrypt=True, + data_allocate_fraction=1.0, ) b = batch.Batch([]) plan = b.get_plan(args) @@ -79,6 +80,7 @@ class TestBatch(object): bluestore=True, block_db_size="1G", dmcrypt=True, + data_allocate_fraction=1.0, ) b = batch.Batch([]) plan = b.get_plan(args) @@ -104,6 +106,7 @@ class TestBatch(object): bluestore=True, block_db_size="1G", dmcrypt=True, + data_allocate_fraction=1.0, ) b = batch.Batch([]) plan = b.get_plan(args) @@ -131,6 +134,7 @@ class TestBatch(object): bluestore=True, block_db_size="1G", dmcrypt=True, + data_allocate_fraction=1.0, ) b = batch.Batch([]) plan = b.get_plan(args) @@ -178,30 +182,35 @@ class TestBatch(object): osds_per_device): conf_ceph_stub('[global]\nfsid=asdf-lkjh') args = factory(data_slots=1, osds_per_device=osds_per_device, - osd_ids=[], dmcrypt=False) + osd_ids=[], dmcrypt=False, + data_allocate_fraction=1.0) osds = batch.get_physical_osds(mock_devices_available, args) assert len(osds) == len(mock_devices_available) * osds_per_device def test_get_physical_osds_rel_size(self, factory, mock_devices_available, conf_ceph_stub, - osds_per_device): + osds_per_device, + data_allocate_fraction): args = factory(data_slots=1, osds_per_device=osds_per_device, - osd_ids=[], 
dmcrypt=False) + osd_ids=[], dmcrypt=False, + data_allocate_fraction=data_allocate_fraction) osds = batch.get_physical_osds(mock_devices_available, args) for osd in osds: - assert osd.data[1] == 1.0 / osds_per_device + assert osd.data[1] == data_allocate_fraction / osds_per_device def test_get_physical_osds_abs_size(self, factory, mock_devices_available, conf_ceph_stub, - osds_per_device): + osds_per_device, + data_allocate_fraction): conf_ceph_stub('[global]\nfsid=asdf-lkjh') args = factory(data_slots=1, osds_per_device=osds_per_device, - osd_ids=[], dmcrypt=False) + osd_ids=[], dmcrypt=False, + data_allocate_fraction=data_allocate_fraction) osds = batch.get_physical_osds(mock_devices_available, args) for osd, dev in zip(osds, mock_devices_available): - assert osd.data[2] == int(dev.vg_size[0] / osds_per_device) + assert osd.data[2] == int(dev.vg_size[0] * (data_allocate_fraction / osds_per_device)) def test_get_physical_osds_osd_ids(self, factory, mock_devices_available, diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_arg_validators.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_arg_validators.py index 59ca12619..35bf295b9 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_arg_validators.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_arg_validators.py @@ -336,3 +336,32 @@ class TestValidBatchDataDevice(object): ) self.validator.zap = False assert self.validator('/dev/foo') + +class TestValidFraction(object): + + def setup(self): + self.validator = arg_validators.ValidFraction() + + def test_fraction_is_valid(self, fake_call): + result = self.validator('0.8') + assert result == 0.8 + + def test_fraction_not_float(self, fake_call): + with pytest.raises(ValueError): + self.validator('xyz') + + def test_fraction_is_nan(self, fake_call): + with pytest.raises(argparse.ArgumentError): + self.validator('NaN') + + def test_fraction_is_negative(self, fake_call): + with pytest.raises(argparse.ArgumentError): + self.validator('-1.0') + + def test_fraction_is_zero(self, fake_call): + with pytest.raises(argparse.ArgumentError): + self.validator('0.0') + + def test_fraction_is_greater_one(self, fake_call): + with pytest.raises(argparse.ArgumentError): + self.validator('1.1') diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py index 8eef3ff00..4e3252934 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_device.py @@ -152,14 +152,6 @@ class TestDevice(object): disk = device.Device("/dev/sda") assert disk.is_device is True - @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) - def test_is_partition(self, fake_call, device_info): - data = {"/dev/sda1": {"foo": "bar"}} - lsblk = {"TYPE": "part", "NAME": "sda1", "PKNAME": "sda"} - device_info(devices=data, lsblk=lsblk) - disk = device.Device("/dev/sda1") - assert disk.is_partition - @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_mpath_device_is_device(self, fake_call, device_info): data = {"/dev/foo": {"foo": "bar"}} @@ -241,7 +233,7 @@ class TestDevice(object): @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_reject_removable_device(self, fake_call, device_info): - data = {"/dev/sdb": {"removable": 1}} + data = {"/dev/sdb": {"removable": "1"}} lsblk = {"TYPE": "disk", "NAME": "sdb"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/sdb") @@ -249,7 +241,7 @@ class 
TestDevice(object): @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_reject_device_with_gpt_headers(self, fake_call, device_info): - data = {"/dev/sdb": {"removable": 0, "size": 5368709120}} + data = {"/dev/sdb": {"removable": "0", "size": 5368709120}} lsblk = {"TYPE": "disk", "NAME": "sdb"} blkid= {"PTTYPE": "gpt"} device_info( @@ -262,7 +254,7 @@ class TestDevice(object): @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_accept_non_removable_device(self, fake_call, device_info): - data = {"/dev/sdb": {"removable": 0, "size": 5368709120}} + data = {"/dev/sdb": {"removable": "0", "size": 5368709120}} lsblk = {"TYPE": "disk", "NAME": "sdb"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/sdb") @@ -286,7 +278,7 @@ class TestDevice(object): fake_call): m_os_path_islink.return_value = True m_os_path_realpath.return_value = '/dev/sdb' - data = {"/dev/sdb": {"ro": 0, "size": 5368709120}} + data = {"/dev/sdb": {"ro": "0", "size": 5368709120}} lsblk = {"TYPE": "disk"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/test_symlink") @@ -304,7 +296,7 @@ class TestDevice(object): fake_call): m_os_path_islink.return_value = True m_os_readlink.return_value = '/dev/dm-0' - data = {"/dev/mapper/mpatha": {"ro": 0, "size": 5368709120}} + data = {"/dev/mapper/mpatha": {"ro": "0", "size": 5368709120}} lsblk = {"TYPE": "disk"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/mapper/mpatha") @@ -312,7 +304,7 @@ class TestDevice(object): @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_reject_readonly_device(self, fake_call, device_info): - data = {"/dev/cdrom": {"ro": 1}} + data = {"/dev/cdrom": {"ro": "1"}} lsblk = {"TYPE": "disk", "NAME": "cdrom"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/cdrom") @@ -328,7 +320,7 @@ class TestDevice(object): @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) def test_accept_non_readonly_device(self, fake_call, device_info): - data = {"/dev/sda": {"ro": 0, "size": 5368709120}} + data = {"/dev/sda": {"ro": "0", "size": 5368709120}} lsblk = {"TYPE": "disk", "NAME": "sda"} device_info(devices=data,lsblk=lsblk) disk = device.Device("/dev/sda") @@ -594,10 +586,10 @@ class TestDeviceOrdering(object): def setup(self): self.data = { - "/dev/sda": {"removable": 0}, - "/dev/sdb": {"removable": 1}, # invalid - "/dev/sdc": {"removable": 0}, - "/dev/sdd": {"removable": 1}, # invalid + "/dev/sda": {"removable": "0"}, + "/dev/sdb": {"removable": "1"}, # invalid + "/dev/sdc": {"removable": "0"}, + "/dev/sdd": {"removable": "1"}, # invalid } @patch("ceph_volume.util.disk.has_bluestore_label", lambda x: False) diff --git a/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py b/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py index fcd644a86..94325b475 100644 --- a/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py +++ b/ceph/src/ceph-volume/ceph_volume/tests/util/test_disk.py @@ -3,6 +3,11 @@ import pytest from ceph_volume.util import disk from mock.mock import patch + @patch('ceph_volume.util.disk.os.path.exists', MagicMock(return_value=True)) + @patch('ceph_volume.util.disk.get_partitions', MagicMock(return_value={"sda1": "sda"})) + def test_is_partition(self): + assert disk.is_partition('sda1') + class TestLsblkParser(object): @@ -225,7 +230,6 @@ class TestGetDevices(object): result = disk.get_devices(_sys_block_path=str(tmpdir)) assert result == {} - 
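The new test_is_partition case above stubs disk.get_partitions() with a {"sda1": "sda"} mapping. For orientation, the sketch below condenses what the reworked helpers in ceph_volume/util/disk.py (later in this patch) compute: every node under /sys/dev/block that exposes a 'partition' attribute is mapped to its parent device, and is_partition() becomes a membership test on that mapping. This is a simplified illustration, not the exact code added to disk.py, which reads the attribute's contents through get_file_contents() and keeps the existence check on the device node.

    import os
    from typing import Dict

    def get_partitions(sys_dev_block: str = '/sys/dev/block') -> Dict[str, str]:
        # Map partition kernel names to their parent device, e.g. {'sda1': 'sda'}.
        result: Dict[str, str] = {}
        for entry in os.listdir(sys_dev_block):
            path = os.path.join(sys_dev_block, entry)              # e.g. '8:1'
            if not os.path.exists(os.path.join(path, 'partition')):
                continue                                           # whole disks carry no 'partition' attribute
            real = os.path.realpath(path)                          # .../block/sda/sda1
            result[os.path.basename(real)] = real.split('/')[-2]
        return result

    def is_partition(dev: str) -> bool:
        # '/dev/sda1' -> 'sda1'; a device is a partition iff sysfs lists it as one.
        return dev.split('/')[-1] in get_partitions()

This also explains the test churn just below: with is_locked_raw_device() removed from disk.py, the TestGetDevices cases no longer need the corresponding patch decorators.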
@patch('ceph_volume.util.disk.is_locked_raw_device', lambda x: False) def test_sda_block_is_found(self, patched_get_block_devs_sysfs, fake_filesystem): sda_path = '/dev/sda' patched_get_block_devs_sysfs.return_value = [[sda_path, sda_path, 'disk']] @@ -235,7 +239,6 @@ class TestGetDevices(object): assert result[sda_path]['model'] == '' assert result[sda_path]['partitions'] == {} - @patch('ceph_volume.util.disk.is_locked_raw_device', lambda x: False) def test_sda_size(self, patched_get_block_devs_sysfs, fake_filesystem): sda_path = '/dev/sda' patched_get_block_devs_sysfs.return_value = [[sda_path, sda_path, 'disk']] @@ -244,7 +247,6 @@ class TestGetDevices(object): assert list(result.keys()) == [sda_path] assert result[sda_path]['human_readable_size'] == '512.00 KB' - @patch('ceph_volume.util.disk.is_locked_raw_device', lambda x: False) def test_sda_sectorsize_fallsback(self, patched_get_block_devs_sysfs, fake_filesystem): # if no sectorsize, it will use queue/hw_sector_size sda_path = '/dev/sda' @@ -254,7 +256,6 @@ class TestGetDevices(object): assert list(result.keys()) == [sda_path] assert result[sda_path]['sectorsize'] == '1024' - @patch('ceph_volume.util.disk.is_locked_raw_device', lambda x: False) def test_sda_sectorsize_from_logical_block(self, patched_get_block_devs_sysfs, fake_filesystem): sda_path = '/dev/sda' patched_get_block_devs_sysfs.return_value = [[sda_path, sda_path, 'disk']] @@ -262,7 +263,6 @@ class TestGetDevices(object): result = disk.get_devices() assert result[sda_path]['sectorsize'] == '99' - @patch('ceph_volume.util.disk.is_locked_raw_device', lambda x: False) def test_sda_sectorsize_does_not_fallback(self, patched_get_block_devs_sysfs, fake_filesystem): sda_path = '/dev/sda' patched_get_block_devs_sysfs.return_value = [[sda_path, sda_path, 'disk']] @@ -271,7 +271,6 @@ class TestGetDevices(object): result = disk.get_devices() assert result[sda_path]['sectorsize'] == '99' - @patch('ceph_volume.util.disk.is_locked_raw_device', lambda x: False) def test_is_rotational(self, patched_get_block_devs_sysfs, fake_filesystem): sda_path = '/dev/sda' patched_get_block_devs_sysfs.return_value = [[sda_path, sda_path, 'disk']] @@ -279,13 +278,21 @@ class TestGetDevices(object): result = disk.get_devices() assert result[sda_path]['rotational'] == '1' - @patch('ceph_volume.util.disk.is_locked_raw_device', lambda x: False) def test_is_ceph_rbd(self, patched_get_block_devs_sysfs, fake_filesystem): rbd_path = '/dev/rbd0' patched_get_block_devs_sysfs.return_value = [[rbd_path, rbd_path, 'disk']] result = disk.get_devices() assert rbd_path not in result + def test_actuator_device(self, patched_get_block_devs_sysfs, fake_filesystem): + sda_path = '/dev/sda' + fake_actuator_nb = 2 + patched_get_block_devs_sysfs.return_value = [[sda_path, sda_path, 'disk']] + for actuator in range(0, fake_actuator_nb): + fake_filesystem.create_dir(f'/sys/block/sda/queue/independent_access_ranges/{actuator}') + result = disk.get_devices() + assert result[sda_path]['actuators'] == fake_actuator_nb + class TestSizeCalculations(object): diff --git a/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py b/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py index 270c8a648..655f7cd55 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py +++ b/ceph/src/ceph-volume/ceph_volume/util/arg_validators.py @@ -1,5 +1,6 @@ import argparse import os +import math from ceph_volume import terminal, decorators, process from ceph_volume.util.device import Device from ceph_volume.util import disk @@ -220,3 
+221,14 @@ def exclude_group_options(parser, groups, argv=None): terminal.warning(msg) last_group = group_name last_flag = flag + +class ValidFraction(object): + """ + Validate fraction is in (0, 1.0] + """ + + def __call__(self, fraction): + fraction_float = float(fraction) + if math.isnan(fraction_float) or fraction_float <= 0.0 or fraction_float > 1.0: + raise argparse.ArgumentError(None, 'Fraction %f not in (0,1.0]' % fraction_float) + return fraction_float diff --git a/ceph/src/ceph-volume/ceph_volume/util/device.py b/ceph/src/ceph-volume/ceph_volume/util/device.py index c5eab1310..c5250d435 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/device.py +++ b/ceph/src/ceph-volume/ceph_volume/util/device.py @@ -85,6 +85,7 @@ class Device(object): 'lsm_data', ] pretty_report_sys_fields = [ + 'actuators', 'human_readable_size', 'model', 'removable', @@ -581,9 +582,8 @@ class Device(object): def _check_generic_reject_reasons(self): reasons = [ - ('removable', 1, 'removable'), - ('ro', 1, 'read-only'), - ('locked', 1, 'locked'), + ('id_bus', 'usb', 'id_bus'), + ('ro', '1', 'read-only'), ] rejected = [reason for (k, v, reason) in reasons if self.sys_api.get(k, '') == v] @@ -619,6 +619,8 @@ class Device(object): rejected.append('Has GPT headers') if self.has_partitions: rejected.append('Has partitions') + if self.has_fs: + rejected.append('Has a FileSystem') return rejected def _check_lvm_reject_reasons(self): diff --git a/ceph/src/ceph-volume/ceph_volume/util/disk.py b/ceph/src/ceph-volume/ceph_volume/util/disk.py index 90527e209..8f03f8a0c 100644 --- a/ceph/src/ceph-volume/ceph_volume/util/disk.py +++ b/ceph/src/ceph-volume/ceph_volume/util/disk.py @@ -6,6 +6,7 @@ import time from ceph_volume import process from ceph_volume.api import lvm from ceph_volume.util.system import get_file_contents +from typing import Dict, List, Any logger = logging.getLogger(__name__) @@ -360,30 +361,18 @@ def is_device(dev): return False # fallback to stat - return _stat_is_device(os.lstat(dev).st_mode) + return _stat_is_device(os.lstat(dev).st_mode) and not is_partition(dev) -def is_partition(dev): +def is_partition(dev: str) -> bool: """ Boolean to determine if a given device is a partition, like /dev/sda1 """ if not os.path.exists(dev): return False - # use lsblk first, fall back to using stat - TYPE = lsblk(dev).get('TYPE') - if TYPE: - return TYPE == 'part' - # fallback to stat - stat_obj = os.stat(dev) - if _stat_is_device(stat_obj.st_mode): - return False - - major = os.major(stat_obj.st_rdev) - minor = os.minor(stat_obj.st_rdev) - if os.path.exists('/sys/dev/block/%d:%d/partition' % (major, minor)): - return True - return False + partitions = get_partitions() + return dev.split("/")[-1] in partitions def is_ceph_rbd(dev): @@ -734,28 +723,6 @@ def is_mapper_device(device_name): return device_name.startswith(('/dev/mapper', '/dev/dm-')) -def is_locked_raw_device(disk_path): - """ - A device can be locked by a third party software like a database. 
- To detect that case, the device is opened in Read/Write and exclusive mode - """ - open_flags = (os.O_RDWR | os.O_EXCL) - open_mode = 0 - fd = None - - try: - fd = os.open(disk_path, open_flags, open_mode) - except OSError: - return 1 - - try: - os.close(fd) - except OSError: - return 1 - - return 0 - - class AllowLoopDevices(object): allow = False warned = False @@ -781,36 +748,34 @@ class AllowLoopDevices(object): allow_loop_devices = AllowLoopDevices() -def get_block_devs_sysfs(_sys_block_path='/sys/block', _sys_dev_block_path='/sys/dev/block', device=''): - def holder_inner_loop(): +def get_block_devs_sysfs(_sys_block_path: str = '/sys/block', _sys_dev_block_path: str = '/sys/dev/block', device: str = '') -> List[List[str]]: + def holder_inner_loop() -> bool: for holder in holders: # /sys/block/sdy/holders/dm-8/dm/uuid - holder_dm_type = get_file_contents(os.path.join(_sys_block_path, dev, f'holders/{holder}/dm/uuid')).split('-')[0].lower() + holder_dm_type: str = get_file_contents(os.path.join(_sys_block_path, dev, f'holders/{holder}/dm/uuid')).split('-')[0].lower() if holder_dm_type == 'mpath': return True # First, get devices that are _not_ partitions - result = list() + result: List[List[str]] = list() if not device: - dev_names = os.listdir(_sys_block_path) + dev_names: List[str] = os.listdir(_sys_block_path) else: dev_names = [device] for dev in dev_names: - name = kname = os.path.join("/dev", dev) + name = kname = pname = os.path.join("/dev", dev) if not os.path.exists(name): continue - type_ = 'disk' - holders = os.listdir(os.path.join(_sys_block_path, dev, 'holders')) - if get_file_contents(os.path.join(_sys_block_path, dev, 'removable')) == "1": - continue + type_: str = 'disk' + holders: List[str] = os.listdir(os.path.join(_sys_block_path, dev, 'holders')) if holder_inner_loop(): continue - dm_dir_path = os.path.join(_sys_block_path, dev, 'dm') + dm_dir_path: str = os.path.join(_sys_block_path, dev, 'dm') if os.path.isdir(dm_dir_path): - dm_type = get_file_contents(os.path.join(dm_dir_path, 'uuid')) - type_ = dm_type.split('-')[0].lower() - basename = get_file_contents(os.path.join(dm_dir_path, 'name')) - name = os.path.join("/dev/mapper", basename) + dm_type: str = get_file_contents(os.path.join(dm_dir_path, 'uuid')) + type_: List[str] = dm_type.split('-')[0].lower() + basename: str = get_file_contents(os.path.join(dm_dir_path, 'name')) + name: str = os.path.join("/dev/mapper", basename) if dev.startswith('loop'): if not allow_loop_devices(): continue @@ -818,17 +783,27 @@ def get_block_devs_sysfs(_sys_block_path='/sys/block', _sys_dev_block_path='/sys if not os.path.exists(os.path.join(_sys_block_path, dev, 'loop')): continue type_ = 'loop' - result.append([kname, name, type_]) + result.append([kname, name, type_, pname]) # Next, look for devices that _are_ partitions - for item in os.listdir(_sys_dev_block_path): - is_part = get_file_contents(os.path.join(_sys_dev_block_path, item, 'partition')) == "1" - dev = os.path.basename(os.readlink(os.path.join(_sys_dev_block_path, item))) - if not is_part: - continue - name = kname = os.path.join("/dev", dev) - result.append([name, kname, "part"]) + partitions: Dict[str, str] = get_partitions() + for partition in partitions.keys(): + name = kname = os.path.join("/dev", partition) + result.append([name, kname, "part", partitions[partition]]) return sorted(result, key=lambda x: x[0]) +def get_partitions(_sys_dev_block_path ='/sys/dev/block') -> List[str]: + devices: List[str] = os.listdir(_sys_dev_block_path) + result: Dict[str, 
str] = dict() + for device in devices: + device_path: str = os.path.join(_sys_dev_block_path, device) + is_partition: bool = int(get_file_contents(os.path.join(device_path, 'partition'), '0')) > 0 + if not is_partition: + continue + + partition_sys_name: str = os.path.basename(os.path.realpath(device_path)) + parent_device_sys_name: str = os.path.realpath(device_path).split('/')[-2:-1][0] + result[partition_sys_name] = parent_device_sys_name + return result def get_devices(_sys_block_path='/sys/block', device=''): """ @@ -845,17 +820,19 @@ def get_devices(_sys_block_path='/sys/block', device=''): block_devs = get_block_devs_sysfs(_sys_block_path) - block_types = ['disk', 'mpath'] + block_types = ['disk', 'mpath', 'part'] if allow_loop_devices(): block_types.append('loop') for block in block_devs: + metadata: Dict[str, Any] = {} devname = os.path.basename(block[0]) diskname = block[1] if block[2] not in block_types: continue sysdir = os.path.join(_sys_block_path, devname) - metadata = {} + if block[2] == 'part': + sysdir = os.path.join(_sys_block_path, block[3], devname) # If the device is ceph rbd it gets excluded if is_ceph_rbd(diskname): @@ -882,11 +859,24 @@ def get_devices(_sys_block_path='/sys/block', device=''): for key, file_ in facts: metadata[key] = get_file_contents(os.path.join(sysdir, file_)) - device_slaves = os.listdir(os.path.join(sysdir, 'slaves')) + if block[2] != 'part': + device_slaves = os.listdir(os.path.join(sysdir, 'slaves')) + metadata['partitions'] = get_partitions_facts(sysdir) + if device_slaves: metadata['device_nodes'] = ','.join(device_slaves) else: - metadata['device_nodes'] = devname + if block[2] == 'part': + metadata['device_nodes'] = block[3] + else: + metadata['device_nodes'] = devname + + metadata['actuators'] = "" + if os.path.isdir(sysdir + "/queue/independent_access_ranges/"): + actuators = 0 + while os.path.isdir(sysdir + "/queue/independent_access_ranges/" + str(actuators)): + actuators += 1 + metadata['actuators'] = actuators metadata['scheduler_mode'] = "" scheduler = get_file_contents(sysdir + "/queue/scheduler") @@ -907,8 +897,13 @@ def get_devices(_sys_block_path='/sys/block', device=''): metadata['size'] = float(size) * 512 metadata['human_readable_size'] = human_readable_size(metadata['size']) metadata['path'] = diskname - metadata['locked'] = is_locked_raw_device(metadata['path']) + metadata['devname'] = devname metadata['type'] = block[2] + metadata['parent'] = block[3] + + # some facts from udevadm + p = udevadm_property(sysdir) + metadata['id_bus'] = p.get('ID_BUS', '') device_facts[diskname] = metadata return device_facts diff --git a/ceph/src/cephadm/cephadm b/ceph/src/cephadm/cephadm index 18b02d3bf..5b85dadfb 100755 --- a/ceph/src/cephadm/cephadm +++ b/ceph/src/cephadm/cephadm @@ -139,6 +139,17 @@ class EndPoint: return f'{self.ip}:{self.port}' +class DeploymentType(Enum): + # Fresh deployment of a daemon. + DEFAULT = 'Deploy' + # Redeploying a daemon. Works the same as fresh + # deployment minus port checking. + REDEPLOY = 'Redeploy' + # Reconfiguring a daemon. Rewrites config + # files and potentially restarts daemon. 
+ RECONFIG = 'Reconfig' + + class BaseConfig: def __init__(self) -> None: @@ -743,6 +754,7 @@ class CephIscsi(object): mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z' mounts[os.path.join(data_dir, 'iscsi-gateway.cfg')] = '/etc/ceph/iscsi-gateway.cfg:z' mounts[os.path.join(data_dir, 'configfs')] = '/sys/kernel/config' + mounts[os.path.join(data_dir, 'tcmu-runner-entrypoint.sh')] = '/usr/local/scripts/tcmu-runner-entrypoint.sh' mounts[log_dir] = '/var/log:z' mounts['/dev'] = '/dev' return mounts @@ -806,9 +818,19 @@ class CephIscsi(object): configfs_dir = os.path.join(data_dir, 'configfs') makedirs(configfs_dir, uid, gid, 0o755) + # set up the tcmu-runner entrypoint script + # to be mounted into the container. For more info + # on why we need this script, see the + # tcmu_runner_entrypoint_script function + self.files['tcmu-runner-entrypoint.sh'] = self.tcmu_runner_entrypoint_script() + # populate files from the config-json populate_files(data_dir, self.files, uid, gid) + # we want the tcmu runner entrypoint script to be executable + # populate_files will give it 0o600 by default + os.chmod(os.path.join(data_dir, 'tcmu-runner-entrypoint.sh'), 0o700) + @staticmethod def configfs_mount_umount(data_dir, mount=True): # type: (str, bool) -> List[str] @@ -821,10 +843,47 @@ class CephIscsi(object): 'umount {0}; fi'.format(mount_path) return cmd.split() + @staticmethod + def tcmu_runner_entrypoint_script() -> str: + # since we are having tcmu-runner be a background + # process in its systemd unit (rbd-target-api being + # the main process) systemd will not restart it when + # it fails. in order to try and get around that for now + # we can have a script mounted in the container that + # that attempts to do the restarting for us. This script + # can then become the entrypoint for the tcmu-runner + # container + + # This is intended to be dropped for a better solution + # for at least the squid release onward + return """#!/bin/bash +RUN_DIR=/var/run/tcmu-runner + +if [ ! -d "${RUN_DIR}" ] ; then + mkdir -p "${RUN_DIR}" +fi + +rm -rf "${RUN_DIR}"/* + +while true +do + touch "${RUN_DIR}"/start-up-$(date -Ins) + /usr/bin/tcmu-runner + + # If we got around 3 kills/segfaults in the last minute, + # don't start anymore + if [ $(find "${RUN_DIR}" -type f -cmin -1 | wc -l) -ge 3 ] ; then + exit 0 + fi + + sleep 1 +done +""" + def get_tcmu_runner_container(self): # type: () -> CephContainer - tcmu_container = get_container(self.ctx, self.fsid, self.daemon_type, self.daemon_id) - tcmu_container.entrypoint = '/usr/bin/tcmu-runner' + tcmu_container = get_deployment_container(self.ctx, self.fsid, self.daemon_type, self.daemon_id) + tcmu_container.entrypoint = '/usr/local/scripts/tcmu-runner-entrypoint.sh' tcmu_container.cname = self.get_container_name(desc='tcmu') # remove extra container args for tcmu container. 
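Stepping back from the iscsi/tcmu details to the wider refactor in this cephadm hunk: the DeploymentType enum introduced above replaces the separate reconfig/redeploy flags, and deploy_daemon() (a little further below) now performs its TCP port-conflict check only for a fresh deployment, since a redeploy or reconfig of an already-running daemon is expected to find its own ports bound. The following is a condensed, standalone sketch of that gating, not cephadm's actual function; check_required_ports() and the injected port_in_use() predicate are illustrative names.

    from enum import Enum
    from typing import Callable, List

    class DeploymentType(Enum):
        DEFAULT = 'Deploy'      # fresh deployment: required ports must be free
        REDEPLOY = 'Redeploy'   # existing daemon, new container: skip port checks
        RECONFIG = 'Reconfig'   # only rewrite config files: skip port checks

    def check_required_ports(ports: List[int],
                             deployment_type: DeploymentType,
                             port_in_use: Callable[[int], bool]) -> None:
        # Only a fresh deployment has to prove its ports are free.
        if deployment_type != DeploymentType.DEFAULT:
            return
        busy = [p for p in ports if port_in_use(p)]
        if busy:
            raise RuntimeError(
                "TCP Port(s) '%s' already in use" % ','.join(map(str, busy)))

cephadm itself additionally keeps the mgr special case shown below, where a port clash is only logged as a warning because mgr_standby_modules=false can make the conflict expected.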
# extra args could cause issue with forking service type @@ -2545,6 +2604,17 @@ def _write_custom_conf_files(ctx: CephadmContext, daemon_type: str, daemon_id: s os.fchown(f.fileno(), uid, gid) os.fchmod(f.fileno(), 0o600) f.write(ccf['content']) + # temporary workaround to make custom config files work for tcmu-runner + # container we deploy with iscsi until iscsi is refactored + if daemon_type == 'iscsi': + tcmu_config_dir = custom_config_dir + '.tcmu' + if not os.path.exists(tcmu_config_dir): + makedirs(tcmu_config_dir, uid, gid, 0o755) + tcmu_file_path = os.path.join(tcmu_config_dir, os.path.basename(ccf['mount_path'])) + with open(tcmu_file_path, 'w', encoding='utf-8') as f: + os.fchown(f.fileno(), uid, gid) + os.fchmod(f.fileno(), 0o600) + f.write(ccf['content']) def get_parm(option: str) -> Dict[str, str]: @@ -2960,26 +3030,29 @@ def extract_uid_gid(ctx, img='', file_path='/var/lib/ceph'): raise RuntimeError('uid/gid not found') -def deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid, - config=None, keyring=None, - osd_fsid=None, - reconfig=False, - ports=None): - # type: (CephadmContext, str, str, Union[int, str], Optional[CephContainer], int, int, Optional[str], Optional[str], Optional[str], Optional[bool], Optional[List[int]]) -> None +def deploy_daemon(ctx: CephadmContext, fsid: str, daemon_type: str, + daemon_id: Union[int, str], c: Optional['CephContainer'], + uid: int, gid: int, config: Optional[str] = None, + keyring: Optional[str] = None, osd_fsid: Optional[str] = None, + deployment_type: DeploymentType = DeploymentType.DEFAULT, + ports: Optional[List[int]] = None) -> None: ports = ports or [] - if any([port_in_use(ctx, port) for port in ports]): - if daemon_type == 'mgr': - # non-fatal for mgr when we are in mgr_standby_modules=false, but we can't - # tell whether that is the case here. - logger.warning( - f"ceph-mgr TCP port(s) {','.join(map(str, ports))} already in use" - ) - else: - raise Error("TCP Port(s) '{}' required for {} already in use".format(','.join(map(str, ports)), daemon_type)) + # only check port in use if fresh deployment since service + # we are redeploying/reconfiguring will already be using the port + if deployment_type == DeploymentType.DEFAULT: + if any([port_in_use(ctx, port) for port in ports]): + if daemon_type == 'mgr': + # non-fatal for mgr when we are in mgr_standby_modules=false, but we can't + # tell whether that is the case here. 
+ logger.warning( + f"ceph-mgr TCP port(s) {','.join(map(str, ports))} already in use" + ) + else: + raise Error("TCP Port(s) '{}' required for {} already in use".format(','.join(map(str, ports)), daemon_type)) data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id) - if reconfig and not os.path.exists(data_dir): + if deployment_type == DeploymentType.RECONFIG and not os.path.exists(data_dir): raise Error('cannot reconfig, data path %s does not exist' % data_dir) if daemon_type == 'mon' and not os.path.exists(data_dir): assert config @@ -3026,7 +3099,9 @@ def deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid, uid, gid, config, keyring) - if not reconfig: + # only write out unit files and start daemon + # with systemd if this is not a reconfig + if deployment_type != DeploymentType.RECONFIG: if daemon_type == CephadmDaemon.daemon_type: port = next(iter(ports), None) # get first tcp port provided or None @@ -3065,7 +3140,9 @@ def deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid, fw.open_ports(ports + fw.external_ports.get(daemon_type, [])) fw.apply_rules() - if reconfig and daemon_type not in Ceph.daemons: + # If this was a reconfig and the daemon is not a Ceph daemon, restart it + # so it can pick up potential changes to its configuration files + if deployment_type == DeploymentType.RECONFIG and daemon_type not in Ceph.daemons: # ceph daemons do not need a restart; others (presumably) do to pick # up the new config call_throws(ctx, ['systemctl', 'reset-failed', @@ -3556,6 +3633,18 @@ def install_base_units(ctx, fsid): first child (bash), but that isn't the ceph daemon. This is simpler and should be harmless. """ + targets: List[str] = [ + 'ceph-mon', + 'ceph-mgr', + 'ceph-mds', + 'ceph-osd', + 'ceph-fuse', + 'radosgw', + 'rbd-mirror', + 'cephfs-mirror', + 'tcmu-runner' + ] + f.write("""# created by cephadm /var/log/ceph/%s/*.log { rotate 7 @@ -3563,13 +3652,13 @@ def install_base_units(ctx, fsid): compress sharedscripts postrotate - killall -q -1 ceph-mon ceph-mgr ceph-mds ceph-osd ceph-fuse radosgw rbd-mirror cephfs-mirror || pkill -1 -x 'ceph-mon|ceph-mgr|ceph-mds|ceph-osd|ceph-fuse|radosgw|rbd-mirror|cephfs-mirror' || true + killall -q -1 %s || pkill -1 -x '%s' || true endscript missingok notifempty su root root } -""" % fsid) +""" % (fsid, ' '.join(targets), '|'.join(targets))) def get_unit_file(ctx, fsid): @@ -4697,8 +4786,10 @@ def finish_bootstrap_config( cli(['config', 'set', 'global', 'container_image', f'{ctx.image}']) if mon_network: - logger.info(f'Setting mon public_network to {mon_network}') - cli(['config', 'set', 'mon', 'public_network', mon_network]) + cp = read_config(ctx.config) + cfg_section = 'global' if cp.has_option('global', 'public_network') else 'mon' + logger.info(f'Setting public_network to {mon_network} in {cfg_section} config section') + cli(['config', 'set', cfg_section, 'public_network', mon_network]) if cluster_network: logger.info(f'Setting cluster_network to {cluster_network}') @@ -5126,6 +5217,24 @@ def get_deployment_container(ctx: CephadmContext, return c +def get_deployment_type(ctx: CephadmContext, daemon_type: str, daemon_id: str) -> DeploymentType: + deployment_type: DeploymentType = DeploymentType.DEFAULT + if ctx.reconfig: + deployment_type = DeploymentType.RECONFIG + unit_name = get_unit_name(ctx.fsid, daemon_type, daemon_id) + (_, state, _) = check_unit(ctx, unit_name) + if state == 'running' or is_container_running(ctx, CephContainer.for_daemon(ctx, ctx.fsid, daemon_type, daemon_id, 'bash')): + # if reconfig 
was set, that takes priority over redeploy. If + # this is considered a fresh deployment at this stage, + # mark it as a redeploy to avoid port checking + if deployment_type == DeploymentType.DEFAULT: + deployment_type = DeploymentType.REDEPLOY + + logger.info(f'{deployment_type.value} daemon {ctx.name} ...') + + return deployment_type + + @default_image def command_deploy(ctx): # type: (CephadmContext) -> None @@ -5137,18 +5246,7 @@ def command_deploy(ctx): if daemon_type not in get_supported_daemons(): raise Error('daemon type %s not recognized' % daemon_type) - redeploy = False - unit_name = get_unit_name(ctx.fsid, daemon_type, daemon_id) - (_, state, _) = check_unit(ctx, unit_name) - if state == 'running' or is_container_running(ctx, CephContainer.for_daemon(ctx, ctx.fsid, daemon_type, daemon_id, 'bash')): - redeploy = True - - if ctx.reconfig: - logger.info('%s daemon %s ...' % ('Reconfig', ctx.name)) - elif redeploy: - logger.info('%s daemon %s ...' % ('Redeploy', ctx.name)) - else: - logger.info('%s daemon %s ...' % ('Deploy', ctx.name)) + deployment_type: DeploymentType = get_deployment_type(ctx, daemon_type, daemon_id) # Migrate sysctl conf files from /usr/lib to /etc migrate_sysctl_dir(ctx, ctx.fsid) @@ -5156,11 +5254,8 @@ def command_deploy(ctx): # Get and check ports explicitly required to be opened daemon_ports = [] # type: List[int] - # only check port in use if not reconfig or redeploy since service - # we are redeploying/reconfiguring will already be using the port - if not ctx.reconfig and not redeploy: - if ctx.tcp_ports: - daemon_ports = list(map(int, ctx.tcp_ports.split())) + if ctx.tcp_ports: + daemon_ports = list(map(int, ctx.tcp_ports.split())) if daemon_type in Ceph.daemons: config, keyring = get_config_and_keyring(ctx) @@ -5172,7 +5267,7 @@ def command_deploy(ctx): deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid, config=config, keyring=keyring, osd_fsid=ctx.osd_fsid, - reconfig=ctx.reconfig, + deployment_type=deployment_type, ports=daemon_ports) elif daemon_type in Monitoring.components: @@ -5194,11 +5289,12 @@ def command_deploy(ctx): uid, gid = extract_uid_gid_monitoring(ctx, daemon_type) c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id) deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid, - reconfig=ctx.reconfig, + deployment_type=deployment_type, ports=daemon_ports) elif daemon_type == NFSGanesha.daemon_type: - if not ctx.reconfig and not redeploy and not daemon_ports: + # only check ports if this is a fresh deployment + if deployment_type == DeploymentType.DEFAULT and not daemon_ports: daemon_ports = list(NFSGanesha.port_map.values()) config, keyring = get_config_and_keyring(ctx) @@ -5207,7 +5303,7 @@ def command_deploy(ctx): c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id) deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid, config=config, keyring=keyring, - reconfig=ctx.reconfig, + deployment_type=deployment_type, ports=daemon_ports) elif daemon_type == CephIscsi.daemon_type: @@ -5216,7 +5312,7 @@ def command_deploy(ctx): c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id) deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid, config=config, keyring=keyring, - reconfig=ctx.reconfig, + deployment_type=deployment_type, ports=daemon_ports) elif daemon_type == HAproxy.daemon_type: @@ -5224,7 +5320,7 @@ def command_deploy(ctx): uid, gid = haproxy.extract_uid_gid_haproxy() c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id) deploy_daemon(ctx, 
ctx.fsid, daemon_type, daemon_id, c, uid, gid, - reconfig=ctx.reconfig, + deployment_type=deployment_type, ports=daemon_ports) elif daemon_type == Keepalived.daemon_type: @@ -5232,19 +5328,20 @@ def command_deploy(ctx): uid, gid = keepalived.extract_uid_gid_keepalived() c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id) deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid, - reconfig=ctx.reconfig, + deployment_type=deployment_type, ports=daemon_ports) elif daemon_type == CustomContainer.daemon_type: cc = CustomContainer.init(ctx, ctx.fsid, daemon_id) - if not ctx.reconfig and not redeploy: + # only check ports if this is a fresh deployment + if deployment_type == DeploymentType.DEFAULT: daemon_ports.extend(cc.ports) c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id, privileged=cc.privileged, ptrace=ctx.allow_ptrace) deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid=cc.uid, gid=cc.gid, config=None, - keyring=None, reconfig=ctx.reconfig, + keyring=None, deployment_type=deployment_type, ports=daemon_ports) elif daemon_type == CephadmDaemon.daemon_type: @@ -5259,13 +5356,16 @@ def command_deploy(ctx): CephadmDaemon.validate_config(config_js) deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, None, - uid, gid, ports=daemon_ports) + uid, gid, + deployment_type=deployment_type, + ports=daemon_ports) elif daemon_type == SNMPGateway.daemon_type: sc = SNMPGateway.init(ctx, ctx.fsid, daemon_id) c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id) deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, sc.uid, sc.gid, + deployment_type=deployment_type, ports=daemon_ports) else: @@ -6186,6 +6286,9 @@ def command_adopt_prometheus(ctx, daemon_id, fsid): # type: (CephadmContext, str, str) -> None daemon_type = 'prometheus' (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type) + # should try to set the ports we know cephadm defaults + # to for these services in the firewall. + ports = Monitoring.port_map['prometheus'] _stop_and_disable(ctx, 'prometheus') @@ -6207,7 +6310,8 @@ def command_adopt_prometheus(ctx, daemon_id, fsid): make_var_run(ctx, fsid, uid, gid) c = get_container(ctx, fsid, daemon_type, daemon_id) - deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid) + deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid, + deployment_type=DeploymentType.REDEPLOY, ports=ports) update_firewalld(ctx, daemon_type) @@ -6216,6 +6320,9 @@ def command_adopt_grafana(ctx, daemon_id, fsid): daemon_type = 'grafana' (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type) + # should try to set the ports we know cephadm defaults + # to for these services in the firewall. + ports = Monitoring.port_map['grafana'] _stop_and_disable(ctx, 'grafana-server') @@ -6261,7 +6368,8 @@ def command_adopt_grafana(ctx, daemon_id, fsid): make_var_run(ctx, fsid, uid, gid) c = get_container(ctx, fsid, daemon_type, daemon_id) - deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid) + deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid, + deployment_type=DeploymentType.REDEPLOY, ports=ports) update_firewalld(ctx, daemon_type) @@ -6270,6 +6378,9 @@ def command_adopt_alertmanager(ctx, daemon_id, fsid): daemon_type = 'alertmanager' (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type) + # should try to set the ports we know cephadm defaults + # to for these services in the firewall. 
+ ports = Monitoring.port_map['alertmanager'] _stop_and_disable(ctx, 'prometheus-alertmanager') @@ -6291,7 +6402,8 @@ def command_adopt_alertmanager(ctx, daemon_id, fsid): make_var_run(ctx, fsid, uid, gid) c = get_container(ctx, fsid, daemon_type, daemon_id) - deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid) + deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid, + deployment_type=DeploymentType.REDEPLOY, ports=ports) update_firewalld(ctx, daemon_type) diff --git a/ceph/src/cephadm/tests/test_cephadm.py b/ceph/src/cephadm/tests/test_cephadm.py index d2fb87424..ec526a05e 100644 --- a/ceph/src/cephadm/tests/test_cephadm.py +++ b/ceph/src/cephadm/tests/test_cephadm.py @@ -1712,11 +1712,11 @@ if ! grep -qs /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id # iscsi tcmu-runner container ! /usr/bin/podman rm -f ceph-9b9d7609-f4d5-4aba-94c8-effa764d96c9-iscsi.daemon_id-tcmu 2> /dev/null ! /usr/bin/podman rm -f ceph-9b9d7609-f4d5-4aba-94c8-effa764d96c9-iscsi-daemon_id-tcmu 2> /dev/null -/usr/bin/podman run --rm --ipc=host --stop-signal=SIGTERM --net=host --entrypoint /usr/bin/tcmu-runner --privileged --group-add=disk --init --name ceph-9b9d7609-f4d5-4aba-94c8-effa764d96c9-iscsi-daemon_id-tcmu --pids-limit=0 -e CONTAINER_IMAGE=ceph/ceph -e NODE_NAME=host1 -e CEPH_USE_RANDOM_NONCE=1 -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/config:/etc/ceph/ceph.conf:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/keyring:/etc/ceph/keyring:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/iscsi-gateway.cfg:/etc/ceph/iscsi-gateway.cfg:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/configfs:/sys/kernel/config -v /var/log/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9:/var/log:z -v /dev:/dev --mount type=bind,source=/lib/modules,destination=/lib/modules,ro=true ceph/ceph & +/usr/bin/podman run --rm --ipc=host --stop-signal=SIGTERM --net=host --entrypoint /usr/local/scripts/tcmu-runner-entrypoint.sh --privileged --group-add=disk --init --name ceph-9b9d7609-f4d5-4aba-94c8-effa764d96c9-iscsi-daemon_id-tcmu --pids-limit=0 -e CONTAINER_IMAGE=ceph/ceph -e NODE_NAME=host1 -e CEPH_USE_RANDOM_NONCE=1 -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/config:/etc/ceph/ceph.conf:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/keyring:/etc/ceph/keyring:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/iscsi-gateway.cfg:/etc/ceph/iscsi-gateway.cfg:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/configfs:/sys/kernel/config -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/tcmu-runner-entrypoint.sh:/usr/local/scripts/tcmu-runner-entrypoint.sh -v /var/log/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9:/var/log:z -v /dev:/dev --mount type=bind,source=/lib/modules,destination=/lib/modules,ro=true ceph/ceph & # iscsi.daemon_id ! /usr/bin/podman rm -f ceph-9b9d7609-f4d5-4aba-94c8-effa764d96c9-iscsi.daemon_id 2> /dev/null ! 
/usr/bin/podman rm -f ceph-9b9d7609-f4d5-4aba-94c8-effa764d96c9-iscsi-daemon_id 2> /dev/null -/usr/bin/podman run --rm --ipc=host --stop-signal=SIGTERM --net=host --entrypoint /usr/bin/rbd-target-api --privileged --group-add=disk --init --name ceph-9b9d7609-f4d5-4aba-94c8-effa764d96c9-iscsi-daemon_id --pids-limit=0 -e CONTAINER_IMAGE=ceph/ceph -e NODE_NAME=host1 -e CEPH_USE_RANDOM_NONCE=1 -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/config:/etc/ceph/ceph.conf:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/keyring:/etc/ceph/keyring:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/iscsi-gateway.cfg:/etc/ceph/iscsi-gateway.cfg:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/configfs:/sys/kernel/config -v /var/log/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9:/var/log:z -v /dev:/dev --mount type=bind,source=/lib/modules,destination=/lib/modules,ro=true ceph/ceph +/usr/bin/podman run --rm --ipc=host --stop-signal=SIGTERM --net=host --entrypoint /usr/bin/rbd-target-api --privileged --group-add=disk --init --name ceph-9b9d7609-f4d5-4aba-94c8-effa764d96c9-iscsi-daemon_id --pids-limit=0 -e CONTAINER_IMAGE=ceph/ceph -e NODE_NAME=host1 -e CEPH_USE_RANDOM_NONCE=1 -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/config:/etc/ceph/ceph.conf:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/keyring:/etc/ceph/keyring:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/iscsi-gateway.cfg:/etc/ceph/iscsi-gateway.cfg:z -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/configfs:/sys/kernel/config -v /var/lib/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9/iscsi.daemon_id/tcmu-runner-entrypoint.sh:/usr/local/scripts/tcmu-runner-entrypoint.sh -v /var/log/ceph/9b9d7609-f4d5-4aba-94c8-effa764d96c9:/var/log:z -v /dev:/dev --mount type=bind,source=/lib/modules,destination=/lib/modules,ro=true ceph/ceph """ def test_get_container(self): diff --git a/ceph/src/client/Client.cc b/ceph/src/client/Client.cc index 1944e6081..d43336180 100644 --- a/ceph/src/client/Client.cc +++ b/ceph/src/client/Client.cc @@ -2322,6 +2322,12 @@ void Client::_closed_mds_session(MetaSession *s, int err, bool rejected) mds_sessions.erase(s->mds_num); } +static void reinit_mds_features(MetaSession *session, + const MConstRef& m) { + session->mds_features = std::move(m->supported_features); + session->mds_metric_flags = std::move(m->metric_spec.metric_flags); +} + void Client::handle_client_session(const MConstRef& m) { mds_rank_t from = mds_rank_t(m->get_source().num()); @@ -2340,6 +2346,13 @@ void Client::handle_client_session(const MConstRef& m) if (session->state == MetaSession::STATE_OPEN) { ldout(cct, 10) << "mds." << from << " already opened, ignore it" << dendl; + // The MDS could send a client_session(open) message even when + // the session state is STATE_OPEN. Normally, it's fine to + // ignore this message, but if the MDS sent this message just + // after it was upgraded, the MDS feature bits could differ + // from the ones before the upgrade, so refresh the feature + // bits the client holds.
+ reinit_mds_features(session, m); return; } /* @@ -2358,8 +2371,7 @@ void Client::handle_client_session(const MConstRef& m) _closed_mds_session(session, -CEPHFS_EPERM, true); break; } - session->mds_features = std::move(m->supported_features); - session->mds_metric_flags = std::move(m->metric_spec.metric_flags); + reinit_mds_features(session, m); renew_caps(session); session->state = MetaSession::STATE_OPEN; @@ -3980,6 +3992,7 @@ void Client::check_caps(Inode *in, unsigned flags) flush_tid = 0; } + in->delay_cap_item.remove_myself(); send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain, flushing, flush_tid); } @@ -4405,11 +4418,19 @@ void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id if (flags & CEPH_CAP_FLAG_AUTH) { if (in->auth_cap != &cap && (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) { - if (in->auth_cap && in->flushing_cap_item.is_on_list()) { - ldout(cct, 10) << __func__ << " changing auth cap: " - << "add myself to new auth MDS' flushing caps list" << dendl; - adjust_session_flushing_caps(in, in->auth_cap->session, mds_session); + if (in->auth_cap) { + if (in->flushing_cap_item.is_on_list()) { + ldout(cct, 10) << __func__ << " changing auth cap: " + << "add myself to new auth MDS' flushing caps list" << dendl; + adjust_session_flushing_caps(in, in->auth_cap->session, mds_session); + } + if (in->dirty_cap_item.is_on_list()) { + ldout(cct, 10) << __func__ << " changing auth cap: " + << "add myself to new auth MDS' dirty caps list" << dendl; + mds_session->get_dirty_list().push_back(&in->dirty_cap_item); + } } + in->auth_cap = ∩ } } @@ -4691,6 +4712,9 @@ void Client::trim_caps(MetaSession *s, uint64_t max) // is deleted inside remove_cap ++p; + if (in->dirty_caps || in->cap_snaps.size()) + cap_delay_requeue(in.get()); + if (in->caps.size() > 1 && cap != in->auth_cap) { int mine = cap->issued | cap->implemented; int oissued = in->auth_cap ? in->auth_cap->issued : 0; @@ -4728,7 +4752,8 @@ void Client::trim_caps(MetaSession *s, uint64_t max) } if (all && in->ino != CEPH_INO_ROOT) { ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl; - trimmed++; + if (!in->dirty_caps && !in->cap_snaps.size()) + trimmed++; } } } @@ -4800,35 +4825,25 @@ void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSe } /* - * Flush all caps back to the MDS. Because the callers generally wait on the - * result of this function (syncfs and umount cases), we set - * CHECK_CAPS_SYNCHRONOUS on the last check_caps call. + * Flush all the dirty caps back to the MDS. Because the callers + * generally wait on the result of this function (syncfs and umount + * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call. 
*/ void Client::flush_caps_sync() { ldout(cct, 10) << __func__ << dendl; - xlist::iterator p = delayed_list.begin(); - while (!p.end()) { - unsigned flags = CHECK_CAPS_NODELAY; - Inode *in = *p; + for (auto &q : mds_sessions) { + auto &s = q.second; + xlist::iterator p = s.dirty_list.begin(); + while (!p.end()) { + unsigned flags = CHECK_CAPS_NODELAY; + Inode *in = *p; - ++p; - delayed_list.pop_front(); - if (p.end() && dirty_list.empty()) - flags |= CHECK_CAPS_SYNCHRONOUS; - check_caps(in, flags); - } - - // other caps, too - p = dirty_list.begin(); - while (!p.end()) { - unsigned flags = CHECK_CAPS_NODELAY; - Inode *in = *p; - - ++p; - if (p.end()) - flags |= CHECK_CAPS_SYNCHRONOUS; - check_caps(in, flags); + ++p; + if (p.end()) + flags |= CHECK_CAPS_SYNCHRONOUS; + check_caps(in, flags); + } } } @@ -5261,24 +5276,48 @@ void Client::handle_caps(const MConstRef& m) got_mds_push(session); + bool do_cap_release = false; Inode *in; vinodeno_t vino(m->get_ino(), CEPH_NOSNAP); if (auto it = inode_map.find(vino); it != inode_map.end()) { in = it->second; - } else { - if (m->get_op() == CEPH_CAP_OP_IMPORT) { - ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl; - session->enqueue_cap_release( - m->get_ino(), - m->get_cap_id(), - m->get_seq(), - m->get_mseq(), - cap_epoch_barrier); - } else { - ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl; - } - // in case the mds is waiting on e.g. a revocation + /* MDS may be waiting for cap release with increased seq */ + switch (m->get_op()) { + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + if (!in->caps.count(mds)) { + do_cap_release = true; + ldout(cct, 5) << __func__ << " vino " << vino << " don't have cap " + << m->get_cap_id() << " op " << m->get_op() + << ", immediately releasing" << dendl; + } + } + } else { + /* MDS may be waiting for cap release with increased seq */ + switch (m->get_op()) { + case CEPH_CAP_OP_IMPORT: + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: + do_cap_release = true; + ldout(cct, 5) << __func__ << " don't have vino " << vino << " op " + << m->get_op() << ", immediately releasing" << dendl; + break; + default: + ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl; + return; + } + } + + // In case the mds is waiting on e.g. 
a revocation + if (do_cap_release) { + session->enqueue_cap_release( + m->get_ino(), + m->get_cap_id(), + m->get_seq(), + m->get_mseq(), + cap_epoch_barrier); + flush_cap_releases(); return; } @@ -6662,13 +6701,16 @@ void Client::_unmount(bool abort) } if (abort || blocklisted) { - for (auto p = dirty_list.begin(); !p.end(); ) { - Inode *in = *p; - ++p; - if (in->dirty_caps) { - ldout(cct, 0) << " drop dirty caps on " << *in << dendl; - in->mark_caps_clean(); - put_inode(in); + for (auto &q : mds_sessions) { + auto &s = q.second; + for (auto p = s.dirty_list.begin(); !p.end(); ) { + Inode *in = *p; + ++p; + if (in->dirty_caps) { + ldout(cct, 0) << " drop dirty caps on " << *in << dendl; + in->mark_caps_clean(); + put_inode(in); + } } } } else { diff --git a/ceph/src/client/Client.h b/ceph/src/client/Client.h index a2a8ed8d0..3f737c93f 100644 --- a/ceph/src/client/Client.h +++ b/ceph/src/client/Client.h @@ -883,8 +883,6 @@ public: return std::make_pair(opened_inodes, inode_map.size()); } - xlist &get_dirty_list() { return dirty_list; } - /* timer_lock for 'timer' */ ceph::mutex timer_lock = ceph::make_mutex("Client::timer_lock"); SafeTimer timer; @@ -1548,8 +1546,7 @@ private: // cap flushing ceph_tid_t last_flush_tid = 1; - // dirty_list keeps all the dirty inodes before flushing. - xlist delayed_list, dirty_list; + xlist delayed_list; int num_flushing_caps = 0; ceph::unordered_map snap_realms; std::map metadata; diff --git a/ceph/src/client/Inode.cc b/ceph/src/client/Inode.cc index da68c24c4..dd2a711f7 100644 --- a/ceph/src/client/Inode.cc +++ b/ceph/src/client/Inode.cc @@ -772,12 +772,27 @@ void Inode::unset_deleg(Fh *fh) */ void Inode::mark_caps_dirty(int caps) { + /* + * If auth_cap is nullptr, it means the reconnect is not finished or + * was already rejected. + */ + if (!auth_cap) { + ceph_assert(!dirty_caps); + + lsubdout(client->cct, client, 1) << __func__ << " " << *this << " dirty caps '" << ccap_string(caps) + << "', but no auth cap." << dendl; + return; + } + lsubdout(client->cct, client, 10) << __func__ << " " << *this << " " << ccap_string(dirty_caps) << " -> " << ccap_string(dirty_caps | caps) << dendl; + if (caps && !caps_dirty()) iget(); + dirty_caps |= caps; - client->get_dirty_list().push_back(&dirty_cap_item); + auth_cap->session->get_dirty_list().push_back(&dirty_cap_item); + client->cap_delay_requeue(this); } /** diff --git a/ceph/src/client/MetaSession.h b/ceph/src/client/MetaSession.h index d9a1e481a..89a179981 100644 --- a/ceph/src/client/MetaSession.h +++ b/ceph/src/client/MetaSession.h @@ -50,6 +50,8 @@ struct MetaSession { list waiting_for_open; xlist caps; + // dirty_list keeps all the dirty inodes before flushing in the current session. 
+ xlist dirty_list; xlist flushing_caps; xlist requests; xlist unsafe_requests; @@ -60,6 +62,15 @@ struct MetaSession { MetaSession(mds_rank_t mds_num, ConnectionRef con, const entity_addrvec_t& addrs) : mds_num(mds_num), con(con), addrs(addrs) { } + ~MetaSession() { + ceph_assert(caps.empty()); + ceph_assert(dirty_list.empty()); + ceph_assert(flushing_caps.empty()); + ceph_assert(requests.empty()); + ceph_assert(unsafe_requests.empty()); + } + + xlist &get_dirty_list() { return dirty_list; } const char *get_state_name() const; diff --git a/ceph/src/cls/rgw/cls_rgw.cc b/ceph/src/cls/rgw/cls_rgw.cc index 10068f4cd..04e4b53c2 100644 --- a/ceph/src/cls/rgw/cls_rgw.cc +++ b/ceph/src/cls/rgw/cls_rgw.cc @@ -667,75 +667,6 @@ int rgw_bucket_list(cls_method_context_t hctx, bufferlist *in, bufferlist *out) } } // rgw_bucket_list - -static int check_index(cls_method_context_t hctx, - rgw_bucket_dir_header *existing_header, - rgw_bucket_dir_header *calc_header) -{ - int rc = read_bucket_header(hctx, existing_header); - if (rc < 0) { - CLS_LOG(1, "ERROR: check_index(): failed to read header\n"); - return rc; - } - - calc_header->tag_timeout = existing_header->tag_timeout; - calc_header->ver = existing_header->ver; - calc_header->syncstopped = existing_header->syncstopped; - - map keys; - string start_obj; - string filter_prefix; - -#define CHECK_CHUNK_SIZE 1000 - bool done = false; - bool more; - - do { - rc = get_obj_vals(hctx, start_obj, filter_prefix, CHECK_CHUNK_SIZE, &keys, &more); - if (rc < 0) - return rc; - - for (auto kiter = keys.begin(); kiter != keys.end(); ++kiter) { - if (!bi_is_plain_entry(kiter->first)) { - done = true; - break; - } - - rgw_bucket_dir_entry entry; - auto eiter = kiter->second.cbegin(); - try { - decode(entry, eiter); - } catch (ceph::buffer::error& err) { - CLS_LOG(1, "ERROR: rgw_bucket_list(): failed to decode entry, key=%s", kiter->first.c_str()); - return -EIO; - } - rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category]; - stats.num_entries++; - stats.total_size += entry.meta.accounted_size; - stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size); - stats.actual_size += entry.meta.size; - - start_obj = kiter->first; - } - } while (keys.size() == CHECK_CHUNK_SIZE && !done); - - return 0; -} - -int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out) -{ - CLS_LOG(10, "entered %s", __func__); - rgw_cls_check_index_ret ret; - - int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header); - if (rc < 0) - return rc; - - encode(ret, *out); - - return 0; -} - static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header *header) { header->ver++; @@ -745,19 +676,6 @@ static int write_bucket_header(cls_method_context_t hctx, rgw_bucket_dir_header return cls_cxx_map_write_header(hctx, &header_bl); } - -int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out) -{ - CLS_LOG(10, "entered %s()\n", __func__); - rgw_bucket_dir_header existing_header; - rgw_bucket_dir_header calc_header; - int rc = check_index(hctx, &existing_header, &calc_header); - if (rc < 0) - return rc; - - return write_bucket_header(hctx, &calc_header); -} - int rgw_bucket_update_stats(cls_method_context_t hctx, bufferlist *in, bufferlist *out) { CLS_LOG(10, "entered %s()\n", __func__); @@ -1776,6 +1694,9 @@ static int rgw_bucket_link_olh(cls_method_context_t hctx, bufferlist *in, buffer return ret; } olh.set_tag(op.olh_tag); + if (op.key.instance.empty()){ + 
obj.set_epoch(1); + } } /* update the olh log */ @@ -2933,6 +2854,115 @@ static int list_olh_entries(cls_method_context_t hctx, return count; } +static int check_index(cls_method_context_t hctx, + rgw_bucket_dir_header *existing_header, + rgw_bucket_dir_header *calc_header) +{ + int rc = read_bucket_header(hctx, existing_header); + if (rc < 0) { + CLS_LOG(1, "ERROR: check_index(): failed to read header\n"); + return rc; + } + + calc_header->tag_timeout = existing_header->tag_timeout; + calc_header->ver = existing_header->ver; + calc_header->syncstopped = existing_header->syncstopped; + + std::list entries; + string start_obj; + string filter_prefix; + +#define CHECK_CHUNK_SIZE 1000 + bool more; + + do { + rc = list_plain_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more); + if (rc < 0) { + return rc; + } + + for (const auto & bientry : entries) { + rgw_bucket_dir_entry entry; + auto diter = bientry.data.cbegin(); + try { + decode(entry, diter); + } catch (ceph::buffer::error& err) { + CLS_LOG(1, "ERROR:check_index(): failed to decode entry, key=%s", bientry.idx.c_str()); + return -EIO; + } + + if (entry.exists && entry.flags == 0) { + rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category]; + stats.num_entries++; + stats.total_size += entry.meta.accounted_size; + stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size); + stats.actual_size += entry.meta.size; + } + start_obj = bientry.idx; + } + entries.clear(); + } while (more); + + start_obj = ""; + do { + rc = list_instance_entries(hctx, filter_prefix, start_obj, CHECK_CHUNK_SIZE, &entries, &more); + if (rc < 0) { + return rc; + } + + for (const auto & bientry : entries) { + rgw_bucket_dir_entry entry; + auto diter = bientry.data.cbegin(); + try { + decode(entry, diter); + } catch (ceph::buffer::error& err) { + CLS_LOG(1, "ERROR:check_index(): failed to decode entry, key=%s", bientry.idx.c_str()); + return -EIO; + } + + if (entry.exists) { + rgw_bucket_category_stats& stats = calc_header->stats[entry.meta.category]; + stats.num_entries++; + stats.total_size += entry.meta.accounted_size; + stats.total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size); + stats.actual_size += entry.meta.size; + } + start_obj = bientry.idx; + } + entries.clear(); + } while (more); + + return 0; +} + +int rgw_bucket_rebuild_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + CLS_LOG(10, "entered %s", __func__); + rgw_bucket_dir_header existing_header; + rgw_bucket_dir_header calc_header; + int rc = check_index(hctx, &existing_header, &calc_header); + if (rc < 0) + return rc; + + return write_bucket_header(hctx, &calc_header); +} + + +int rgw_bucket_check_index(cls_method_context_t hctx, bufferlist *in, bufferlist *out) +{ + CLS_LOG(10, "entered %s", __func__); + rgw_cls_check_index_ret ret; + + int rc = check_index(hctx, &ret.existing_header, &ret.calculated_header); + if (rc < 0) + return rc; + + encode(ret, *out); + + return 0; +} + + /* Lists all the entries that appear in a bucket index listing. 
* * It may not be obvious why this function calls three other "segment" diff --git a/ceph/src/cls/rgw/cls_rgw_types.cc b/ceph/src/cls/rgw/cls_rgw_types.cc index 4a982eccb..4e9392427 100644 --- a/ceph/src/cls/rgw/cls_rgw_types.cc +++ b/ceph/src/cls/rgw/cls_rgw_types.cc @@ -320,38 +320,29 @@ bool rgw_cls_bi_entry::get_info(cls_rgw_obj_key *key, RGWObjCategory *category, rgw_bucket_category_stats *accounted_stats) { - bool account = false; - auto iter = data.cbegin(); using ceph::decode; - switch (type) { - case BIIndexType::Plain: - account = true; - // NO BREAK; falls through to case InstanceIdx: - case BIIndexType::Instance: - { - rgw_bucket_dir_entry entry; - decode(entry, iter); - account = (account && entry.exists); - *key = entry.key; - *category = entry.meta.category; - accounted_stats->num_entries++; - accounted_stats->total_size += entry.meta.accounted_size; - accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size); - accounted_stats->actual_size += entry.meta.size; - } - break; - case BIIndexType::OLH: - { - rgw_bucket_olh_entry entry; - decode(entry, iter); - *key = entry.key; - } - break; - default: - break; + auto iter = data.cbegin(); + if (type == BIIndexType::OLH) { + rgw_bucket_olh_entry entry; + decode(entry, iter); + *key = entry.key; + return false; } - - return account; + + rgw_bucket_dir_entry entry; + decode(entry, iter); + *key = entry.key; + *category = entry.meta.category; + accounted_stats->num_entries++; + accounted_stats->total_size += entry.meta.accounted_size; + accounted_stats->total_size_rounded += cls_rgw_get_rounded_size(entry.meta.accounted_size); + accounted_stats->actual_size += entry.meta.size; + if (type == BIIndexType::Plain) { + return entry.exists && entry.flags == 0; + } else if (type == BIIndexType::Instance) { + return entry.exists; + } + return false; } void rgw_bucket_olh_entry::dump(Formatter *f) const diff --git a/ceph/src/common/bit_vector.hpp b/ceph/src/common/bit_vector.hpp index 9ce3e8b1e..961d9a019 100644 --- a/ceph/src/common/bit_vector.hpp +++ b/ceph/src/common/bit_vector.hpp @@ -83,7 +83,7 @@ public: }; public: - template + template class IteratorImpl { private: friend class BitVector; @@ -94,7 +94,7 @@ public: // cached derived values uint64_t m_index = 0; uint64_t m_shift = 0; - DataIterator m_data_iterator; + DataIteratorT m_data_iterator; IteratorImpl(BitVectorT *bit_vector, uint64_t offset) : m_bit_vector(bit_vector), @@ -129,7 +129,7 @@ public: inline IteratorImpl operator++(int) { IteratorImpl iterator_impl(*this); - ++iterator_impl; + ++*this; return iterator_impl; } inline IteratorImpl operator+(uint64_t offset) { @@ -145,17 +145,15 @@ public: return (m_offset != rhs.m_offset || m_bit_vector != rhs.m_bit_vector); } - inline ConstReference operator*() const { - return ConstReference(m_data_iterator, m_shift); - } - inline Reference operator*() { - return Reference(m_data_iterator, m_shift); + inline ReferenceT operator*() const { + return ReferenceT(m_data_iterator, m_shift); } }; typedef IteratorImpl ConstIterator; - typedef IteratorImpl Iterator; + bufferlist::const_iterator, + ConstReference> ConstIterator; + typedef IteratorImpl Iterator; static const uint32_t BLOCK_SIZE; static const uint8_t BIT_COUNT = _bit_count; diff --git a/ceph/src/common/intrusive_lru.h b/ceph/src/common/intrusive_lru.h index 422c24a14..2837f6798 100644 --- a/ceph/src/common/intrusive_lru.h +++ b/ceph/src/common/intrusive_lru.h @@ -166,6 +166,10 @@ public: evict(); } + ~intrusive_lru() { + set_target_size(0); + } + 
friend void intrusive_ptr_add_ref<>(intrusive_lru_base *); friend void intrusive_ptr_release<>(intrusive_lru_base *); }; diff --git a/ceph/src/common/legacy_config_opts.h b/ceph/src/common/legacy_config_opts.h index 07fc65744..0e3b4c8ef 100644 --- a/ceph/src/common/legacy_config_opts.h +++ b/ceph/src/common/legacy_config_opts.h @@ -144,7 +144,6 @@ OPTION(ms_blackhole_client, OPT_BOOL) OPTION(ms_dump_on_send, OPT_BOOL) // hexdump msg to log on send OPTION(ms_dump_corrupt_message_level, OPT_INT) // debug level to hexdump undecodeable messages at OPTION(ms_async_op_threads, OPT_U64) // number of worker processing threads for async messenger created on init -OPTION(ms_async_max_op_threads, OPT_U64) // max number of worker processing threads for async messenger OPTION(ms_async_rdma_device_name, OPT_STR) OPTION(ms_async_rdma_enable_hugepage, OPT_BOOL) OPTION(ms_async_rdma_buffer_size, OPT_INT) @@ -924,6 +923,8 @@ OPTION(bluefs_replay_recovery, OPT_BOOL) OPTION(bluefs_replay_recovery_disable_compact, OPT_BOOL) OPTION(bluefs_check_for_zeros, OPT_BOOL) +OPTION(bluefs_check_volume_selector_on_umount, OPT_BOOL) +OPTION(bluefs_check_volume_selector_often, OPT_BOOL) OPTION(bluestore_bluefs, OPT_BOOL) OPTION(bluestore_bluefs_env_mirror, OPT_BOOL) // mirror to normal Env for debug // how often (sec) to dump allocator on allocation failure diff --git a/ceph/src/common/options.cc b/ceph/src/common/options.cc index 546908b55..2c4f7bef4 100644 --- a/ceph/src/common/options.cc +++ b/ceph/src/common/options.cc @@ -597,6 +597,7 @@ std::vector