diff --git a/drivers/gpu/drm/xe/xe_engine.c b/drivers/gpu/drm/xe/xe_engine.c
index 49d92f089242..1843e886a405 100644
--- a/drivers/gpu/drm/xe/xe_engine.c
+++ b/drivers/gpu/drm/xe/xe_engine.c
@@ -18,6 +18,7 @@
 #include "xe_macros.h"
 #include "xe_migrate.h"
 #include "xe_pm.h"
+#include "xe_ring_ops_types.h"
 #include "xe_trace.h"
 #include "xe_vm.h"
 
@@ -679,6 +680,37 @@ static void engine_kill_compute(struct xe_engine *e)
 	up_write(&e->vm->lock);
 }
 
+/**
+ * xe_engine_is_lr() - Whether an engine is long-running
+ * @e: The engine
+ *
+ * Return: True if the engine is long-running, false otherwise.
+ */
+bool xe_engine_is_lr(struct xe_engine *e)
+{
+	return e->vm && xe_vm_no_dma_fences(e->vm) &&
+		!(e->flags & ENGINE_FLAG_VM);
+}
+
+static s32 xe_engine_num_job_inflight(struct xe_engine *e)
+{
+	return e->lrc->fence_ctx.next_seqno - xe_lrc_seqno(e->lrc) - 1;
+}
+
+/**
+ * xe_engine_ring_full() - Whether an engine's ring is full
+ * @e: The engine
+ *
+ * Return: True if the engine's ring is full, false otherwise.
+ */
+bool xe_engine_ring_full(struct xe_engine *e)
+{
+	struct xe_lrc *lrc = e->lrc;
+	s32 max_job = lrc->ring.size / MAX_JOB_SIZE_BYTES;
+
+	return xe_engine_num_job_inflight(e) >= max_job;
+}
+
 /**
  * xe_engine_is_idle() - Whether an engine is idle.
  * @engine: The engine
diff --git a/drivers/gpu/drm/xe/xe_engine.h b/drivers/gpu/drm/xe/xe_engine.h
index b95d9b040877..3017e4fe308d 100644
--- a/drivers/gpu/drm/xe/xe_engine.h
+++ b/drivers/gpu/drm/xe/xe_engine.h
@@ -42,6 +42,10 @@ static inline bool xe_engine_is_parallel(struct xe_engine *engine)
 	return engine->width > 1;
 }
 
+bool xe_engine_is_lr(struct xe_engine *e);
+
+bool xe_engine_ring_full(struct xe_engine *e);
+
 bool xe_engine_is_idle(struct xe_engine *engine);
 
 void xe_engine_kill(struct xe_engine *e);
diff --git a/drivers/gpu/drm/xe/xe_exec.c b/drivers/gpu/drm/xe/xe_exec.c
index 4f7694a29348..700f65b66d40 100644
--- a/drivers/gpu/drm/xe/xe_exec.c
+++ b/drivers/gpu/drm/xe/xe_exec.c
@@ -14,6 +14,7 @@
 #include "xe_device.h"
 #include "xe_engine.h"
 #include "xe_macros.h"
+#include "xe_ring_ops_types.h"
 #include "xe_sched_job.h"
 #include "xe_sync.h"
 #include "xe_vm.h"
@@ -302,6 +303,11 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		goto err_engine_end;
 	}
 
+	if (xe_engine_is_lr(engine) && xe_engine_ring_full(engine)) {
+		err = -EWOULDBLOCK;
+		goto err_engine_end;
+	}
+
 	job = xe_sched_job_create(engine, xe_engine_is_parallel(engine) ?
 				  addresses : &args->address);
 	if (IS_ERR(job)) {
@@ -388,6 +394,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 		xe_sync_entry_signal(&syncs[i], job,
 				     &job->drm.s_fence->finished);
 
+	if (xe_engine_is_lr(engine))
+		engine->ring_ops->emit_job(job);
 	xe_sched_job_push(job);
 
 	xe_vm_reactivate_rebind(vm);
diff --git a/drivers/gpu/drm/xe/xe_guc_engine_types.h b/drivers/gpu/drm/xe/xe_guc_engine_types.h
index 512615d1ce8c..5565412fe7f1 100644
--- a/drivers/gpu/drm/xe/xe_guc_engine_types.h
+++ b/drivers/gpu/drm/xe/xe_guc_engine_types.h
@@ -31,6 +31,8 @@ struct xe_guc_engine {
 	 */
 #define MAX_STATIC_MSG_TYPE	3
 	struct xe_sched_msg static_msgs[MAX_STATIC_MSG_TYPE];
+	/** @lr_tdr: long-running TDR worker */
+	struct work_struct lr_tdr;
 	/** @fini_async: do final fini async from this worker */
 	struct work_struct fini_async;
 	/** @resume_time: time of last resume */
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index 7be06320dbd7..9c0fd1368b77 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -483,6 +483,14 @@ static void register_engine(struct xe_engine *e)
 		parallel_write(xe, map, wq_desc.wq_status, WQ_STATUS_ACTIVE);
 	}
 
+	/*
+	 * We must keep a reference for LR engines if the engine is registered
+	 * with the GuC, as jobs signal immediately and we can't destroy an
+	 * engine while the GuC holds a reference to it.
+	 */
+	if (xe_engine_is_lr(e))
+		xe_engine_get(e);
+
 	set_engine_registered(e);
 	trace_xe_engine_register(e);
 	if (xe_engine_is_parallel(e))
@@ -645,6 +653,7 @@ guc_engine_run_job(struct drm_sched_job *drm_job)
 {
 	struct xe_sched_job *job = to_xe_sched_job(drm_job);
 	struct xe_engine *e = job->engine;
+	bool lr = xe_engine_is_lr(e);
 
 	XE_BUG_ON((engine_destroyed(e) || engine_pending_disable(e)) &&
 		  !engine_banned(e) && !engine_suspended(e));
@@ -654,14 +663,19 @@ guc_engine_run_job(struct drm_sched_job *drm_job)
 	if (!engine_killed_or_banned(e) && !xe_sched_job_is_error(job)) {
 		if (!engine_registered(e))
 			register_engine(e);
-		e->ring_ops->emit_job(job);
+		if (!lr)	/* LR jobs are emitted in the exec IOCTL */
+			e->ring_ops->emit_job(job);
 		submit_engine(e);
 	}
 
-	if (test_and_set_bit(JOB_FLAG_SUBMIT, &job->fence->flags))
+	if (lr) {
+		xe_sched_job_set_error(job, -EOPNOTSUPP);
+		return NULL;
+	} else if (test_and_set_bit(JOB_FLAG_SUBMIT, &job->fence->flags)) {
 		return job->fence;
-	else
+	} else {
 		return dma_fence_get(job->fence);
+	}
 }
 
 static void guc_engine_free_job(struct drm_sched_job *drm_job)
@@ -764,6 +778,55 @@ static void simple_error_capture(struct xe_engine *e)
 }
 #endif
 
+static void xe_guc_engine_trigger_cleanup(struct xe_engine *e)
+{
+	struct xe_guc *guc = engine_to_guc(e);
+
+	if (xe_engine_is_lr(e))
+		queue_work(guc_to_gt(guc)->ordered_wq, &e->guc->lr_tdr);
+	else
+		xe_sched_tdr_queue_imm(&e->guc->sched);
+}
+
+static void xe_guc_engine_lr_cleanup(struct work_struct *w)
+{
+	struct xe_guc_engine *ge =
+		container_of(w, struct xe_guc_engine, lr_tdr);
+	struct xe_engine *e = ge->engine;
+	struct xe_gpu_scheduler *sched = &ge->sched;
+
+	XE_WARN_ON(!xe_engine_is_lr(e));
+	trace_xe_engine_lr_cleanup(e);
+
+	/* Kill the run_job / process_msg entry points */
+	xe_sched_submission_stop(sched);
+
+	/* Engine state now stable, disable scheduling / deregister if needed */
+	if (engine_registered(e)) {
+		struct xe_guc *guc = engine_to_guc(e);
+		int ret;
+
+		set_engine_banned(e);
+		disable_scheduling_deregister(guc, e);
+
+		/*
+		 * Must wait for scheduling to be disabled before signalling
+		 * any fences; if the GT is broken, the GT reset code should signal us.
+		 */
+		ret = wait_event_timeout(guc->ct.wq,
					 !engine_pending_disable(e) ||
					 guc_read_stopped(guc), HZ * 5);
+		if (!ret) {
+			XE_WARN_ON("Schedule disable failed to respond");
+			xe_sched_submission_start(sched);
+			xe_gt_reset_async(e->gt);
+			return;
+		}
+	}
+
+	xe_sched_submission_start(sched);
+}
+
 static enum drm_gpu_sched_stat
 guc_engine_timedout_job(struct drm_sched_job *drm_job)
 {
@@ -815,7 +878,7 @@ guc_engine_timedout_job(struct drm_sched_job *drm_job)
 		err = -EIO;
 		set_engine_banned(e);
 		xe_engine_get(e);
-		disable_scheduling_deregister(engine_to_guc(e), e);
+		disable_scheduling_deregister(guc, e);
 
 		/*
 		 * Must wait for scheduling to be disabled before signalling
@@ -848,7 +911,7 @@ guc_engine_timedout_job(struct drm_sched_job *drm_job)
 	 */
 	xe_sched_add_pending_job(sched, job);
 	xe_sched_submission_start(sched);
-	xe_sched_tdr_queue_imm(&e->guc->sched);
+	xe_guc_engine_trigger_cleanup(e);
 
 	/* Mark all outstanding jobs as bad, thus completing them */
 	spin_lock(&sched->base.job_list_lock);
@@ -872,6 +935,8 @@ static void __guc_engine_fini_async(struct work_struct *w)
 
 	trace_xe_engine_destroy(e);
 
+	if (xe_engine_is_lr(e))
+		cancel_work_sync(&ge->lr_tdr);
 	if (e->flags & ENGINE_FLAG_PERSISTENT)
 		xe_device_remove_persistent_engines(gt_to_xe(e->gt), e);
 	release_guc_id(guc, e);
@@ -889,7 +954,7 @@ static void guc_engine_fini_async(struct xe_engine *e)
 	bool kernel = e->flags & ENGINE_FLAG_KERNEL;
 
 	INIT_WORK(&e->guc->fini_async, __guc_engine_fini_async);
-	queue_work(system_unbound_wq, &e->guc->fini_async);
+	queue_work(system_wq, &e->guc->fini_async);
 
 	/* We must block on kernel engines so slabs are empty on driver unload */
 	if (kernel) {
@@ -1080,6 +1145,9 @@ static int guc_engine_init(struct xe_engine *e)
 		goto err_sched;
 	e->priority = XE_ENGINE_PRIORITY_NORMAL;
 
+	if (xe_engine_is_lr(e))
+		INIT_WORK(&e->guc->lr_tdr, xe_guc_engine_lr_cleanup);
+
 	mutex_lock(&guc->submission_state.lock);
 	err = alloc_guc_id(guc, e);
 	if (err)
@@ -1131,7 +1199,7 @@ static void guc_engine_kill(struct xe_engine *e)
 {
 	trace_xe_engine_kill(e);
 	set_engine_killed(e);
-	xe_sched_tdr_queue_imm(&e->guc->sched);
+	xe_guc_engine_trigger_cleanup(e);
 }
 
 static void guc_engine_add_msg(struct xe_engine *e, struct xe_sched_msg *msg,
@@ -1283,10 +1351,11 @@ static void guc_engine_stop(struct xe_guc *guc, struct xe_engine *e)
 	xe_sched_submission_stop(sched);
 
 	/* Clean up lost G2H + reset engine state */
-	if (engine_destroyed(e) && engine_registered(e)) {
-		if (engine_banned(e))
+	if (engine_registered(e)) {
+		if ((engine_banned(e) && engine_destroyed(e)) ||
+		    xe_engine_is_lr(e))
 			xe_engine_put(e);
-		else
+		else if (engine_destroyed(e))
 			__guc_engine_fini(guc, e);
 	}
 	if (e->guc->suspend_pending) {
@@ -1501,7 +1570,8 @@ int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
 	trace_xe_engine_deregister_done(e);
 
 	clear_engine_registered(e);
-	if (engine_banned(e))
+
+	if (engine_banned(e) || xe_engine_is_lr(e))
 		xe_engine_put(e);
 	else
 		__guc_engine_fini(guc, e);
@@ -1538,7 +1608,7 @@ int xe_guc_engine_reset_handler(struct xe_guc *guc, u32 *msg, u32 len)
 	 */
 	set_engine_reset(e);
 	if (!engine_banned(e))
-		xe_sched_tdr_queue_imm(&e->guc->sched);
+		xe_guc_engine_trigger_cleanup(e);
 
 	return 0;
 }
@@ -1565,7 +1635,7 @@ int xe_guc_engine_memory_cat_error_handler(struct xe_guc *guc, u32 *msg,
 	/* Treat the same as engine reset */
 	set_engine_reset(e);
 	if (!engine_banned(e))
-		xe_sched_tdr_queue_imm(&e->guc->sched);
+		xe_guc_engine_trigger_cleanup(e);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/xe/xe_trace.h b/drivers/gpu/drm/xe/xe_trace.h
index 878ab4115d91..8a5d35f15791 100644
--- a/drivers/gpu/drm/xe/xe_trace.h
+++ b/drivers/gpu/drm/xe/xe_trace.h
@@ -220,6 +220,11 @@ DEFINE_EVENT(xe_engine, xe_engine_resubmit,
 	     TP_ARGS(e)
 );
 
+DEFINE_EVENT(xe_engine, xe_engine_lr_cleanup,
+	     TP_PROTO(struct xe_engine *e),
+	     TP_ARGS(e)
+);
+
 DECLARE_EVENT_CLASS(xe_sched_job,
 		    TP_PROTO(struct xe_sched_job *job),
 		    TP_ARGS(job),
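
With the ring-full check above, an exec IOCTL on a long-running engine can now fail with -EWOULDBLOCK instead of queuing a job the ring has no room for. A minimal userspace sketch of the resulting retry pattern follows; the do_xe_exec() wrapper around DRM_IOCTL_XE_EXEC and the backoff interval are illustrative assumptions, not part of this series.

#include <errno.h>
#include <unistd.h>

/* Hypothetical wrapper around DRM_IOCTL_XE_EXEC that returns -errno. */
extern int do_xe_exec(int fd, void *exec_args);

/*
 * -EWOULDBLOCK from a long-running engine is transient: it only means
 * no job slot is free until the hardware retires an earlier job, so
 * back off and resubmit.
 */
static int submit_lr_exec(int fd, void *exec_args)
{
	int ret;

	for (;;) {
		ret = do_xe_exec(fd, exec_args);
		if (ret != -EWOULDBLOCK)
			return ret;
		usleep(100);	/* arbitrary backoff; tune per workload */
	}
}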
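
The accounting behind that check, xe_engine_num_job_inflight() and xe_engine_ring_full(), is plain seqno arithmetic and can be sanity-checked in isolation. Below is a self-contained model of the same expressions; the ring size and MAX_JOB_SIZE_BYTES values are illustrative stand-ins for the driver's constants, and the -1 mirrors the driver's expression (presumably because the LRC's completed seqno is initialized one behind next_seqno).

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define RING_SIZE		(16 * 1024)	/* illustrative */
#define MAX_JOB_SIZE_BYTES	256		/* illustrative */

/*
 * Jobs emitted but not yet retired: the gap between the next seqno to
 * be assigned and the seqno the hardware last wrote back, minus one.
 */
static int32_t num_jobs_inflight(uint32_t next_seqno, uint32_t hw_seqno)
{
	return (int32_t)(next_seqno - hw_seqno) - 1;
}

static bool ring_full(uint32_t next_seqno, uint32_t hw_seqno)
{
	int32_t max_job = RING_SIZE / MAX_JOB_SIZE_BYTES;

	return num_jobs_inflight(next_seqno, hw_seqno) >= max_job;
}

int main(void)
{
	assert(num_jobs_inflight(1, 0) == 0);	/* fresh engine: idle */
	assert(!ring_full(1, 0));
	assert(num_jobs_inflight(65, 0) == 64);	/* 64 jobs outstanding */
	assert(ring_full(65, 0));		/* 16384 / 256 == 64 slots */
	return 0;
}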