diff --git a/drivers/gpu/drm/xe/xe_eu_stall.c b/drivers/gpu/drm/xe/xe_eu_stall.c
index 2e7d00f8ff40..e82a6ebfe979 100644
--- a/drivers/gpu/drm/xe/xe_eu_stall.c
+++ b/drivers/gpu/drm/xe/xe_eu_stall.c
@@ -21,10 +21,13 @@
 #include "xe_macros.h"
 #include "xe_observation.h"
 #include "xe_pm.h"
+#include "xe_trace.h"
 
 #include "regs/xe_eu_stall_regs.h"
 #include "regs/xe_gt_regs.h"
 
+#define POLL_PERIOD_MS 5
+
 static size_t per_xecore_buf_size = SZ_512K;
 
 struct per_xecore_buf {
@@ -37,15 +40,18 @@ struct per_xecore_buf {
 };
 
 struct xe_eu_stall_data_stream {
+	bool pollin;
 	bool enabled;
 	int wait_num_reports;
 	int sampling_rate_mult;
+	wait_queue_head_t poll_wq;
 	size_t data_record_size;
 	size_t per_xecore_buf_size;
 
 	struct xe_gt *gt;
 	struct xe_bo *bo;
 	struct per_xecore_buf *xecore_buf;
+	struct delayed_work buf_poll_work;
 };
 
 struct xe_eu_stall_gt {
@@ -53,6 +59,8 @@ struct xe_eu_stall_gt {
 	struct mutex stream_lock;
 	/* EU stall data stream */
 	struct xe_eu_stall_data_stream *stream;
+	/* Workqueue to schedule the buffer-pointer polling work */
+	struct workqueue_struct *buf_ptr_poll_wq;
 };
 
 /**
@@ -114,6 +122,7 @@ static void xe_eu_stall_fini(void *arg)
 {
 	struct xe_gt *gt = arg;
 
+	destroy_workqueue(gt->eu_stall->buf_ptr_poll_wq);
 	mutex_destroy(&gt->eu_stall->stream_lock);
 	kfree(gt->eu_stall);
 }
@@ -139,11 +148,19 @@ int xe_eu_stall_init(struct xe_gt *gt)
 
 	mutex_init(&gt->eu_stall->stream_lock);
 
+	gt->eu_stall->buf_ptr_poll_wq = alloc_ordered_workqueue("xe_eu_stall", 0);
+	if (!gt->eu_stall->buf_ptr_poll_wq) {
+		ret = -ENOMEM;
+		goto exit_free;
+	}
+
 	ret = devm_add_action_or_reset(xe->drm.dev, xe_eu_stall_fini, gt);
 	if (ret)
-		goto exit_free;
+		goto exit_destroy;
 
 	return 0;
+exit_destroy:
+	destroy_workqueue(gt->eu_stall->buf_ptr_poll_wq);
 exit_free:
 	mutex_destroy(&gt->eu_stall->stream_lock);
 	kfree(gt->eu_stall);
@@ -248,6 +265,172 @@ static int xe_eu_stall_user_extensions(struct xe_device *xe, u64 extension,
 	return 0;
 }
 
+/**
+ * buf_data_size - Calculate the number of bytes of data in a circular
+ *		   buffer given the read and write pointers and the size
+ *		   of the buffer.
+ *
+ * @buf_size: Size of the circular buffer
+ * @read_ptr: Read pointer with an additional overflow bit
+ * @write_ptr: Write pointer with an additional overflow bit
+ *
+ * Since the read and write pointers have an additional overflow bit,
+ * this function calculates the offsets from the pointers and uses the
+ * offsets to calculate the data size in the buffer.
+ *
+ * Returns: number of bytes of data in the buffer
+ */
+static u32 buf_data_size(size_t buf_size, u32 read_ptr, u32 write_ptr)
+{
+	u32 read_offset, write_offset, size = 0;
+
+	if (read_ptr == write_ptr)
+		goto exit;
+
+	read_offset = read_ptr & (buf_size - 1);
+	write_offset = write_ptr & (buf_size - 1);
+
+	if (write_offset > read_offset)
+		size = write_offset - read_offset;
+	else
+		size = buf_size - read_offset + write_offset;
+exit:
+	return size;
+}
+
+/**
+ * eu_stall_data_buf_poll - Poll for EU stall data in the buffer.
+ *
+ * @stream: xe EU stall data stream instance
+ *
+ * Returns: true if the EU stall buffer contains minimum stall data as
+ * specified by the event report count, else false.
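+ *
+ * As a rough worked example of the check done here (illustrative values,
+ * not taken from the hardware spec): with a 256KB per-XeCore buffer,
+ * read_ptr = 0x3FFC0 and write_ptr = 0x40040 differ in the overflow bit,
+ * so buf_data_size() reports 0x40000 - 0x3FFC0 + 0x40 = 0x80 bytes;
+ * min_data_present becomes true once num_data_rows() of the running
+ * total reaches wait_num_reports.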
+ */
+static bool eu_stall_data_buf_poll(struct xe_eu_stall_data_stream *stream)
+{
+	u32 read_ptr, write_ptr_reg, write_ptr, total_data = 0;
+	u32 buf_size = stream->per_xecore_buf_size;
+	struct per_xecore_buf *xecore_buf;
+	struct xe_gt *gt = stream->gt;
+	bool min_data_present = false;
+	u16 group, instance;
+	unsigned int xecore;
+
+	mutex_lock(&gt->eu_stall->stream_lock);
+	for_each_dss_steering(xecore, gt, group, instance) {
+		xecore_buf = &stream->xecore_buf[xecore];
+		read_ptr = xecore_buf->read;
+		write_ptr_reg = xe_gt_mcr_unicast_read(gt, XEHPC_EUSTALL_REPORT,
+						       group, instance);
+		write_ptr = REG_FIELD_GET(XEHPC_EUSTALL_REPORT_WRITE_PTR_MASK, write_ptr_reg);
+		write_ptr <<= 6;
+		write_ptr &= ((buf_size << 1) - 1);
+		if (!min_data_present) {
+			total_data += buf_data_size(buf_size, read_ptr, write_ptr);
+			if (num_data_rows(total_data) >= stream->wait_num_reports)
+				min_data_present = true;
+		}
+		xecore_buf->write = write_ptr;
+	}
+	mutex_unlock(&gt->eu_stall->stream_lock);
+
+	return min_data_present;
+}
+
+static int xe_eu_stall_data_buf_read(struct xe_eu_stall_data_stream *stream,
+				     char __user *buf, size_t count,
+				     size_t *total_data_size, struct xe_gt *gt,
+				     u16 group, u16 instance, unsigned int xecore)
+{
+	size_t read_data_size, copy_size, buf_size;
+	u32 read_ptr_reg, read_ptr, write_ptr;
+	u8 *xecore_start_vaddr, *read_vaddr;
+	struct per_xecore_buf *xecore_buf;
+	u32 read_offset, write_offset;
+
+	/* Hardware increments the read and write pointers such that they can
+	 * overflow into one additional bit. For example, the offset pointer
+	 * for a 256KB buffer needs 18 bits, but the HW uses 19 bits for the
+	 * read and write pointers. This technique avoids wasting a slot in
+	 * the buffer. Read and write offsets are calculated from the pointers
+	 * in order to check if the write pointer has wrapped around the array.
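+	 *
+	 * For illustration (example values only, not from the spec): with a
+	 * 256KB buffer, read_ptr = 0x3FF00 and write_ptr = 0x40100 give
+	 * read_offset = 0x3FF00 and write_offset = 0x100. The write side is
+	 * logically 0x200 bytes ahead, yet write_offset < read_offset, so
+	 * the wrapped branch below copies 0x100 bytes from the buffer tail
+	 * and the remaining 0x100 bytes from its start.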
+	 */
+	xecore_buf = &stream->xecore_buf[xecore];
+	xecore_start_vaddr = xecore_buf->vaddr;
+	read_ptr = xecore_buf->read;
+	write_ptr = xecore_buf->write;
+	buf_size = stream->per_xecore_buf_size;
+
+	read_data_size = buf_data_size(buf_size, read_ptr, write_ptr);
+	/* Read only the data that the user space buffer can accommodate */
+	read_data_size = min_t(size_t, count - *total_data_size, read_data_size);
+	if (read_data_size == 0)
+		return 0;
+
+	read_offset = read_ptr & (buf_size - 1);
+	write_offset = write_ptr & (buf_size - 1);
+	read_vaddr = xecore_start_vaddr + read_offset;
+
+	if (write_offset > read_offset) {
+		if (copy_to_user(buf + *total_data_size, read_vaddr, read_data_size))
+			return -EFAULT;
+	} else {
+		if (read_data_size >= buf_size - read_offset)
+			copy_size = buf_size - read_offset;
+		else
+			copy_size = read_data_size;
+		if (copy_to_user(buf + *total_data_size, read_vaddr, copy_size))
+			return -EFAULT;
+		if (copy_to_user(buf + *total_data_size + copy_size,
+				 xecore_start_vaddr, read_data_size - copy_size))
+			return -EFAULT;
+	}
+
+	*total_data_size += read_data_size;
+	read_ptr += read_data_size;
+
+	/* Read pointer can overflow into one additional bit */
+	read_ptr &= (buf_size << 1) - 1;
+	read_ptr_reg = REG_FIELD_PREP(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, (read_ptr >> 6));
+	read_ptr_reg = _MASKED_FIELD(XEHPC_EUSTALL_REPORT1_READ_PTR_MASK, read_ptr_reg);
+	xe_gt_mcr_unicast_write(gt, XEHPC_EUSTALL_REPORT1, read_ptr_reg, group, instance);
+	xecore_buf->read = read_ptr;
+	trace_xe_eu_stall_data_read(group, instance, read_ptr, write_ptr,
+				    read_data_size, *total_data_size);
+	return 0;
+}
+
+/**
+ * xe_eu_stall_stream_read_locked - copy EU stall counters data from the
+ *				    per xecore buffers to the userspace buffer
+ * @stream: A stream opened for EU stall count metrics
+ * @file: An xe EU stall data stream file
+ * @buf: destination buffer given by userspace
+ * @count: the number of bytes userspace wants to read
+ *
+ * Returns: Number of bytes copied or a negative error code.
+ * If we've successfully copied any data then reporting that takes
+ * precedence over any internal error status, so the data isn't lost.
+ */
+static ssize_t xe_eu_stall_stream_read_locked(struct xe_eu_stall_data_stream *stream,
+					      struct file *file, char __user *buf,
+					      size_t count)
+{
+	struct xe_gt *gt = stream->gt;
+	size_t total_size = 0;
+	u16 group, instance;
+	unsigned int xecore;
+	int ret = 0;
+
+	for_each_dss_steering(xecore, gt, group, instance) {
+		ret = xe_eu_stall_data_buf_read(stream, buf, count, &total_size,
+						gt, group, instance, xecore);
+		if (ret || count == total_size)
+			break;
+	}
+	return total_size ?: (ret ?: -EAGAIN);
+}
+
 /*
  * Userspace must enable the EU stall stream with DRM_XE_OBSERVATION_IOCTL_ENABLE
  * before calling read().
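+ *
+ * A minimal userspace sequence might look like this (illustrative sketch;
+ * it assumes the stream fd came from DRM_IOCTL_XE_OBSERVATION with
+ * DRM_XE_OBSERVATION_OP_STREAM_OPEN, and process_records() is a
+ * hypothetical helper):
+ *
+ *	ioctl(fd, DRM_XE_OBSERVATION_IOCTL_ENABLE, 0);
+ *	while ((n = read(fd, buf, sizeof(buf))) > 0)
+ *		process_records(buf, n);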
@@ -255,7 +438,41 @@ static int xe_eu_stall_user_extensions(struct xe_device *xe, u64 extension,
 static ssize_t xe_eu_stall_stream_read(struct file *file, char __user *buf,
 				       size_t count, loff_t *ppos)
 {
-	ssize_t ret = 0;
+	struct xe_eu_stall_data_stream *stream = file->private_data;
+	struct xe_gt *gt = stream->gt;
+	ssize_t ret, aligned_count;
+
+	aligned_count = ALIGN_DOWN(count, stream->data_record_size);
+	if (aligned_count == 0)
+		return -EINVAL;
+
+	if (!stream->enabled) {
+		xe_gt_dbg(gt, "EU stall data stream not enabled to read\n");
+		return -EINVAL;
+	}
+
+	if (!(file->f_flags & O_NONBLOCK)) {
+		do {
+			ret = wait_event_interruptible(stream->poll_wq, stream->pollin);
+			if (ret)
+				return -EINTR;
+
+			mutex_lock(&gt->eu_stall->stream_lock);
+			ret = xe_eu_stall_stream_read_locked(stream, file, buf, aligned_count);
+			mutex_unlock(&gt->eu_stall->stream_lock);
+		} while (ret == -EAGAIN);
+	} else {
+		mutex_lock(&gt->eu_stall->stream_lock);
+		ret = xe_eu_stall_stream_read_locked(stream, file, buf, aligned_count);
+		mutex_unlock(&gt->eu_stall->stream_lock);
+	}
+
+	/*
+	 * This may not work correctly if the user buffer is very small.
+	 * We don't want to block the next read() when there is data in the
+	 * buffer now that couldn't be accommodated in the small user buffer.
+	 */
+	stream->pollin = false;
 
 	return ret;
 }
@@ -348,6 +565,21 @@ static int xe_eu_stall_stream_enable(struct xe_eu_stall_data_stream *stream)
 	return 0;
 }
 
+static void eu_stall_data_buf_poll_work_fn(struct work_struct *work)
+{
+	struct xe_eu_stall_data_stream *stream =
+		container_of(work, typeof(*stream), buf_poll_work.work);
+	struct xe_gt *gt = stream->gt;
+
+	if (eu_stall_data_buf_poll(stream)) {
+		stream->pollin = true;
+		wake_up(&stream->poll_wq);
+	}
+	queue_delayed_work(gt->eu_stall->buf_ptr_poll_wq,
+			   &stream->buf_poll_work,
+			   msecs_to_jiffies(POLL_PERIOD_MS));
+}
+
 static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
 				   struct eu_stall_open_properties *props)
 {
@@ -372,6 +604,9 @@ static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
 			  max_wait_num_reports);
 		return -EINVAL;
 	}
+
+	init_waitqueue_head(&stream->poll_wq);
+	INIT_DELAYED_WORK(&stream->buf_poll_work, eu_stall_data_buf_poll_work_fn);
 	stream->per_xecore_buf_size = per_xecore_buf_size;
 	stream->sampling_rate_mult = props->sampling_rate_mult;
 	stream->wait_num_reports = props->wait_num_reports;
@@ -389,15 +624,35 @@ static int xe_eu_stall_stream_init(struct xe_eu_stall_data_stream *stream,
 	return 0;
 }
 
+static __poll_t xe_eu_stall_stream_poll_locked(struct xe_eu_stall_data_stream *stream,
+					       struct file *file, poll_table *wait)
+{
+	__poll_t events = 0;
+
+	poll_wait(file, &stream->poll_wq, wait);
+
+	if (stream->pollin)
+		events |= EPOLLIN;
+
+	return events;
+}
+
 static __poll_t xe_eu_stall_stream_poll(struct file *file, poll_table *wait)
 {
-	__poll_t ret = 0;
+	struct xe_eu_stall_data_stream *stream = file->private_data;
+	struct xe_gt *gt = stream->gt;
+	__poll_t ret;
+
+	mutex_lock(&gt->eu_stall->stream_lock);
+	ret = xe_eu_stall_stream_poll_locked(stream, file, wait);
+	mutex_unlock(&gt->eu_stall->stream_lock);
 
 	return ret;
 }
 
 static int xe_eu_stall_enable_locked(struct xe_eu_stall_data_stream *stream)
 {
+	struct xe_gt *gt = stream->gt;
 	int ret = 0;
 
 	if (stream->enabled)
@@ -406,6 +661,10 @@ static int xe_eu_stall_enable_locked(struct xe_eu_stall_data_stream *stream)
 	stream->enabled = true;
 
 	ret = xe_eu_stall_stream_enable(stream);
+
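+	/*
+	 * Start the periodic buffer-pointer poll; the work item reschedules
+	 * itself every POLL_PERIOD_MS until it is canceled by
+	 * xe_eu_stall_disable_locked().
+	 */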
+	queue_delayed_work(gt->eu_stall->buf_ptr_poll_wq,
+			   &stream->buf_poll_work,
+			   msecs_to_jiffies(POLL_PERIOD_MS));
 
 	return ret;
 }
@@ -420,6 +679,8 @@ static int xe_eu_stall_disable_locked(struct xe_eu_stall_data_stream *stream)
 
 	xe_gt_mcr_multicast_write(gt, XEHPC_EUSTALL_BASE, 0);
 
+	cancel_delayed_work_sync(&stream->buf_poll_work);
+
 	xe_force_wake_put(gt_to_fw(gt), XE_FW_RENDER);
 	xe_pm_runtime_put(gt_to_xe(gt));
 
diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
index d5281de04d54..b4a3577df70c 100644
--- a/drivers/gpu/drm/xe/xe_trace.h
+++ b/drivers/gpu/drm/xe/xe_trace.h
@@ -427,6 +427,36 @@ DEFINE_EVENT(xe_pm_runtime, xe_pm_runtime_get_ioctl,
 	     TP_ARGS(xe, caller)
 );
 
+TRACE_EVENT(xe_eu_stall_data_read,
+	    TP_PROTO(u8 slice, u8 subslice,
+		     u32 read_ptr, u32 write_ptr,
+		     size_t read_size, size_t total_size),
+	    TP_ARGS(slice, subslice,
+		    read_ptr, write_ptr,
+		    read_size, total_size),
+
+	    TP_STRUCT__entry(__field(u8, slice)
+			     __field(u8, subslice)
+			     __field(u32, read_ptr)
+			     __field(u32, write_ptr)
+			     __field(size_t, read_size)
+			     __field(size_t, total_size)
+			     ),
+
+	    TP_fast_assign(__entry->slice = slice;
+			   __entry->subslice = subslice;
+			   __entry->read_ptr = read_ptr;
+			   __entry->write_ptr = write_ptr;
+			   __entry->read_size = read_size;
+			   __entry->total_size = total_size;
+			   ),
+
+	    TP_printk("slice: %u subslice: %u read ptr: 0x%x write ptr: 0x%x read size: %zu total read size: %zu",
+		      __entry->slice, __entry->subslice,
+		      __entry->read_ptr, __entry->write_ptr,
+		      __entry->read_size, __entry->total_size)
+);
+
 #endif
 
 /* This part must be outside protection */