diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 4b5e65affb81..1878c83ff7e3 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -4794,6 +4794,20 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block) break; } switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(10, 1, 10): + adev->gfx.cleaner_shader_ptr = gfx_10_1_10_cleaner_shader_hex; + adev->gfx.cleaner_shader_size = sizeof(gfx_10_1_10_cleaner_shader_hex); + if (adev->gfx.me_fw_version >= 101 && + adev->gfx.pfp_fw_version >= 158 && + adev->gfx.mec_fw_version >= 152) { + adev->gfx.enable_cleaner_shader = true; + r = amdgpu_gfx_cleaner_shader_sw_init(adev, adev->gfx.cleaner_shader_size); + if (r) { + adev->gfx.enable_cleaner_shader = false; + dev_err(adev->dev, "Failed to initialize cleaner shader\n"); + } + } + break; case IP_VERSION(10, 3, 0): case IP_VERSION(10, 3, 2): case IP_VERSION(10, 3, 4): diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h index 663c2572d440..5255378af53c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0_cleaner_shader.h @@ -21,6 +21,41 @@ * OTHER DEALINGS IN THE SOFTWARE. */ +/* Define the cleaner shader gfx_10_1_10 */ +static const u32 gfx_10_1_10_cleaner_shader_hex[] = { + 0xb0804004, 0xbf8a0000, + 0xbf068100, 0xbf840023, + 0xbe8203b8, 0xbefc0380, + 0x7e008480, 0x7e028480, + 0x7e048480, 0x7e068480, + 0x7e088480, 0x7e0a8480, + 0x7e0c8480, 0x7e0e8480, + 0xbefc0302, 0x80828802, + 0xbf84fff5, 0xbe8203ff, + 0x80000000, 0x87020102, + 0xbf840012, 0xbefe03c1, + 0xbeff03c1, 0xd7650001, + 0x0001007f, 0xd7660001, + 0x0002027e, 0x16020288, + 0xbe8203bf, 0xbefc03c1, + 0xd9382000, 0x00020201, + 0xd9386040, 0x00040401, + 0xd70f6a01, 0x000202ff, + 0x00000400, 0x80828102, + 0xbf84fff7, 0xbefc03ff, + 0x00000068, 0xbe803080, + 0xbe813080, 0xbe823080, + 0xbe833080, 0x80fc847c, + 0xbf84fffa, 0xbeea0480, + 0xbeec0480, 0xbeee0480, + 0xbef00480, 0xbef20480, + 0xbef40480, 0xbef60480, + 0xbef80480, 0xbefa0480, + 0xbf810000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, +}; + /* Define the cleaner shader gfx_10_3_0 */ static const u32 gfx_10_3_0_cleaner_shader_hex[] = { 0xb0804004, 0xbf8a0000, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_1_10_cleaner_shader.asm b/drivers/gpu/drm/amd/amdgpu/gfx_v10_1_10_cleaner_shader.asm new file mode 100644 index 000000000000..9ba3359253c9 --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_1_10_cleaner_shader.asm @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// This shader is to clean LDS, SGPRs and VGPRs. It is first 64 Dwords or 256 bytes of 256 Dwords cleaner shader. + +// GFX10.1 : Clear SGPRs, VGPRs and LDS +// Launch 32 waves per CU (16 per SIMD) as a workgroup (threadgroup) to fill every wave slot +// Waves are "wave32" and have 64 VGPRs each, which uses all 1024 VGPRs per SIMD +// Waves are launched in "CU" mode, and the workgroup shares 64KB of LDS (half of the WGP's LDS) +// It takes 2 workgroups to use all of LDS: one on each CU of the WGP +// Each wave clears SGPRs 0 - 107 +// Each wave clears VGPRs 0 - 63 +// The first wave of the workgroup clears its 64KB of LDS +// The shader starts with "S_BARRIER" to ensure SPI has launched all waves of the workgroup +// before any wave in the workgroup could end. Without this, it is possible not all SGPRs get cleared. + + +shader main + asic(GFX10.1) + type(CS) + wave_size(32) +// Note: original source code from SQ team + +// +// Create 32 waves in a threadgroup (CS waves) +// Each allocates 64 VGPRs +// The workgroup allocates all of LDS (64kbytes) +// +// Takes about 2500 clocks to run. +// (theorhetical fastest = 1024clks vgpr + 640lds = 1660 clks) +// + S_BARRIER + s_cmp_eq_u32 s0, 1 // Bit0 is set, sgpr0 is set then clear VGPRS and LDS as FW set COMPUTE_USER_DATA_0 + s_cbranch_scc0 label_0023 // Clean VGPRs and LDS if sgpr0 of wave is set, scc = (s0 == 1) + + s_mov_b32 s2, 0x00000038 // Loop 64/8=8 times (loop unrolled for performance) + s_mov_b32 m0, 0 + // + // CLEAR VGPRs + // +label_0005: + v_movreld_b32 v0, 0 + v_movreld_b32 v1, 0 + v_movreld_b32 v2, 0 + v_movreld_b32 v3, 0 + v_movreld_b32 v4, 0 + v_movreld_b32 v5, 0 + v_movreld_b32 v6, 0 + v_movreld_b32 v7, 0 + s_mov_b32 m0, s2 + s_sub_u32 s2, s2, 8 + s_cbranch_scc0 label_0005 + // + s_mov_b32 s2, 0x80000000 // Bit31 is first_wave + s_and_b32 s2, s2, s0 // sgpr0 has tg_size (first_wave) term as in ucode only COMPUTE_PGM_RSRC2.tg_size_en is set + s_cbranch_scc0 label_0023 // Clean LDS if its first wave of ThreadGroup/WorkGroup + // CLEAR LDS + // + s_mov_b32 exec_lo, 0xffffffff + s_mov_b32 exec_hi, 0xffffffff + v_mbcnt_lo_u32_b32 v1, exec_hi, 0 // Set V1 to thread-ID (0..63) + v_mbcnt_hi_u32_b32 v1, exec_lo, v1 // Set V1 to thread-ID (0..63) + v_mul_u32_u24 v1, 0x00000008, v1 // * 8, so each thread is a double-dword address (8byte) + s_mov_b32 s2, 0x00000003f // 64 loop iterations + s_mov_b32 m0, 0xffffffff + // Clear all of LDS space + // Each FirstWave of WorkGroup clears 64kbyte block + +label_001F: + ds_write2_b64 v1, v[2:3], v[2:3] offset1:32 + ds_write2_b64 v1, v[4:5], v[4:5] offset0:64 offset1:96 + v_add_co_u32 v1, vcc, 0x00000400, v1 + s_sub_u32 s2, s2, 1 + s_cbranch_scc0 label_001F + + // + // CLEAR SGPRs + // +label_0023: + s_mov_b32 m0, 0x00000068 // Loop 108/4=27 times (loop unrolled for performance) +label_sgpr_loop: + s_movreld_b32 s0, 0 + s_movreld_b32 s1, 0 + s_movreld_b32 s2, 0 + s_movreld_b32 s3, 0 + s_sub_u32 m0, m0, 4 + s_cbranch_scc0 label_sgpr_loop + + //clear vcc + s_mov_b64 vcc, 0 //clear vcc + //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_lo, 0 //clear flat scratch lo SGPR + //s_setreg_imm32_b32 hw_reg_shader_flat_scratch_hi, 0 //clear flat scratch hi SGPR + s_mov_b64 ttmp0, 0 //Clear ttmp0 and ttmp1 + s_mov_b64 ttmp2, 0 //Clear ttmp2 and ttmp3 + s_mov_b64 ttmp4, 0 //Clear ttmp4 and ttmp5 + s_mov_b64 ttmp6, 0 //Clear ttmp6 and ttmp7 + s_mov_b64 ttmp8, 0 //Clear ttmp8 and ttmp9 + s_mov_b64 ttmp10, 0 //Clear ttmp10 and ttmp11 + s_mov_b64 ttmp12, 0 //Clear ttmp12 and ttmp13 + s_mov_b64 ttmp14, 0 //Clear ttmp14 and ttmp15 + + s_endpgm + +end + +