From bb912786bdbc65b08aeb5ae508f2d481ce4b2bf6 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Wed, 22 Sep 2021 11:36:59 +0200
Subject: [PATCH] Added NvFBC on Linux

---
 sunshine/platform/common.h          |   7 +
 sunshine/platform/linux/cuda.cpp    | 703 +++++++++++++++++-----------
 sunshine/platform/linux/cuda.cu     |  92 ++--
 sunshine/platform/linux/cuda.h      |  36 +-
 sunshine/platform/linux/misc.cpp    |  29 +-
 sunshine/platform/linux/x11grab.cpp |   4 +-
 sunshine/video.cpp                  |   2 +-
 7 files changed, 546 insertions(+), 327 deletions(-)
diff --git a/sunshine/platform/common.h b/sunshine/platform/common.h
index 7482f0ea..4b702a9b 100644
--- a/sunshine/platform/common.h
+++ b/sunshine/platform/common.h
@@ -141,6 +141,13 @@ public:
 
 struct img_t {
 public:
+  img_t() = default;
+
+  img_t(img_t &&)      = delete;
+  img_t(const img_t &) = delete;
+  img_t &operator=(img_t &&) = delete;
+  img_t &operator=(const img_t &) = delete;
+
   std::uint8_t *data {};
   std::int32_t width {};
   std::int32_t height {};
diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index e57f914d..25d88518 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -1,3 +1,5 @@
+#include <bitset>
+
 #include <NvFBC.h>
 #include <ffnvcodec/dynlink_loader.h>
 
@@ -55,96 +57,9 @@ inline static int check(CUresult result, const std::string_view &sv) {
   return 0;
 }
 
-class ctx_t {
+class img_t : public platf::img_t {
 public:
-  ctx_t(CUcontext ctx) {
-    CU_CHECK_IGNORE(cdf->cuCtxPushCurrent(ctx), "Couldn't push cuda context");
-  }
-
-  ~ctx_t() {
-    CUcontext dummy;
-
-    CU_CHECK_IGNORE(cdf->cuCtxPopCurrent(&dummy), "Couldn't pop cuda context");
-  }
-};
-
-void free_res(CUgraphicsResource res) {
-  cdf->cuGraphicsUnregisterResource(res);
-}
-
-using res_internal_t = util::safe_ptr<CUgraphicsResource_st, free_res>;
-
-template<std::size_t N>
-class res_t {
-public:
-  res_t() : resources {}, mapped { false } {}
-  res_t(res_t &&other) noexcept : resources { other.resources }, array_p { other.array_p }, ctx { other.ctx }, stream { other.stream } {
-    other.resources = std::array<res_internal_t::pointer, N> {};
-  }
-
-  res_t &operator=(res_t &&other) {
-    for(auto x = 0; x < N; ++x) {
-      std::swap(resources[x], other.resources[x]);
-      std::swap(array_p[x], other.array_p[x]);
-    }
-
-    std::swap(ctx, other.ctx);
-    std::swap(stream, other.stream);
-    std::swap(mapped, other.mapped);
-
-    return *this;
-  }
-
-  res_t(CUcontext ctx, CUstream stream) : resources {}, ctx { ctx }, stream { stream }, mapped { false } {}
-
-  int bind(gl::tex_t &tex) {
-    ctx_t ctx { this->ctx };
-
-    CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[0], tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register Y image");
-    CU_CHECK(cdf->cuGraphicsGLRegisterImage(&resources[1], tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY), "Couldn't register uv image");
-
-    return 0;
-  }
-
-  int map() {
-    ctx_t ctx { this->ctx };
-
-    CU_CHECK(cdf->cuGraphicsMapResources(resources.size(), resources.data(), stream), "Coudn't map cuda resources");
-
-    mapped = true;
-
-    CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[0], resources[0], 0, 0), "Couldn't get mapped subresource [0]");
-    CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&array_p[1], resources[1], 0, 0), "Couldn't get mapped subresource [1]");
-
-    return 0;
-  }
-
-  void unmap() {
-    // Either all or none are mapped
-    if(mapped) {
-      ctx_t ctx { this->ctx };
-
-      CU_CHECK_IGNORE(cdf->cuGraphicsUnmapResources(resources.size(), resources.data(), stream), "Couldn't unmap cuda resources");
-
-      mapped = false;
-    }
-  }
-
-  inline CUarray &operator[](std::size_t index) {
-    return array_p[index];
-  }
-
-  ~res_t() {
-    unmap();
-  }
-
-  std::array<res_internal_t::pointer, N> resources;
-  std::array<CUarray, N> array_p;
-
-  CUcontext ctx;
-  CUstream stream;
-
-  bool mapped;
+  tex_t tex;
 };
 
 int init() {
@@ -160,139 +75,8 @@ int init() {
   return 0;
 }
 
-class opengl_t : public platf::hwdevice_t {
-public:
-  int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) {
-    if(!cdf) {
-      BOOST_LOG(warning) << "cuda not initialized"sv;
-      return -1;
-    }
-
-    this->data = (void *)0x1;
-
-    display = egl::make_display(xdisplay);
-    if(!display) {
-      return -1;
-    }
-
-    auto ctx_opt = egl::make_ctx(display.get());
-    if(!ctx_opt) {
-      return -1;
-    }
-
-    ctx = std::move(*ctx_opt);
-
-    width  = in_width;
-    height = in_height;
-
-    return 0;
-  }
-
-  int set_frame(AVFrame *frame) override {
-    auto cuda_ctx = (AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx;
-
-    tex = gl::tex_t::make(2);
-    fb  = gl::frame_buf_t::make(2);
-
-    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[0]);
-    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RED, frame->width, frame->height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
-    gl::ctx.BindTexture(GL_TEXTURE_2D, tex[1]);
-    gl::ctx.TexImage2D(GL_TEXTURE_2D, 0, GL_RG, frame->width / 2, frame->height / 2, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);
-    gl::ctx.BindTexture(GL_TEXTURE_2D, 0);
-
-    fb.bind(std::begin(tex), std::end(tex));
-
-    res = res_t<2> { cuda_ctx->cuda_ctx, cuda_ctx->stream };
-
-    if(res.bind(tex)) {
-      return -1;
-    }
-
-    this->hwframe.reset(frame);
-    this->frame = frame;
-
-    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
-      BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
-
-      return -1;
-    }
-
-    auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height);
-    if(!sws_opt) {
-      return -1;
-    }
-
-    sws = std::move(*sws_opt);
-    return sws.blank(fb, 0, 0, frame->width, frame->height);
-  }
-
-  int convert(platf::img_t &img) override {
-    sws.load_ram(img);
-
-    if(sws.convert(fb)) {
-      return -1;
-    }
-
-    if(res.map()) {
-      return -1;
-    }
-
-    // Push and pop cuda context
-    ctx_t ctx { res.ctx };
-    for(auto x = 0; x < 2; ++x) {
-      CUDA_MEMCPY2D desc {};
-
-      auto shift = x;
-
-      desc.srcPitch     = frame->width;
-      desc.dstPitch     = frame->linesize[x];
-      desc.Height       = frame->height >> shift;
-      desc.WidthInBytes = std::min(desc.srcPitch, desc.dstPitch);
-
-      desc.srcMemoryType = CU_MEMORYTYPE_ARRAY;
-      desc.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-
-      desc.srcArray  = res[x];
-      desc.dstDevice = (CUdeviceptr)frame->data[x];
-
-      CU_CHECK(cdf->cuMemcpy2DAsync(&desc, res.stream), "Couldn't copy from OpenGL to cuda");
-    }
-
-    res.unmap();
-
-    return 0;
-  }
-
-  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
-    sws.set_colorspace(colorspace, color_range);
-  }
-
-  frame_t hwframe;
-
-  egl::display_t display;
-  egl::ctx_t ctx;
-
-  egl::sws_t sws;
-
-  gl::tex_t tex;
-  gl::frame_buf_t fb;
-
-  res_t<2> res;
-
-  int width, height;
-};
-
 class cuda_t : public platf::hwdevice_t {
 public:
-  ~cuda_t() override {
-    // sws_t needs to be destroyed while the context is active
-    if(sws) {
-      ctx_t ctx { cuda_ctx };
-
-      sws.reset();
-    }
-  }
-
   int init(int in_width, int in_height) {
     if(!cdf) {
       BOOST_LOG(warning) << "cuda not initialized"sv;
@@ -322,78 +106,111 @@ public:
       return -1;
     }
 
-    cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx;
-
-    ctx_t ctx { cuda_ctx };
-    sws = sws_t::make(width, height, frame->width, frame->height, width * 4);
-
-    if(!sws) {
+    auto sws_opt = sws_t::make(width, height, frame->width, frame->height, width * 4);
+    if(!sws_opt) {
       return -1;
     }
 
+    sws = std::move(*sws_opt);
+
     return 0;
   }
 
-  int convert(platf::img_t &img) override {
-    ctx_t ctx { cuda_ctx };
-
-    return sws->load_ram(img) || sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1]);
-  }
-
   void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
-    ctx_t ctx { cuda_ctx };
-    sws->set_colorspace(colorspace, color_range);
+    sws.set_colorspace(colorspace, color_range);
 
+    auto tex = tex_t::make(height, width * 4);
+    if(!tex) {
+      return;
+    }
 
     // The default green color is ugly.
     // Update the background color
     platf::img_t img;
-    img.width = frame->width;
-    img.height = frame->height;
+    img.width       = width;
+    img.height      = height;
     img.pixel_pitch = 4;
-    img.row_pitch = img.width * img.pixel_pitch;
+    img.row_pitch   = img.width * img.pixel_pitch;
 
     std::vector<std::uint8_t> image_data;
     image_data.resize(img.row_pitch * img.height);
 
     img.data = image_data.data();
 
-    if(sws->load_ram(img)) {
+    if(sws.load_ram(img, tex->array)) {
       return;
     }
 
-    sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], {
-      frame->width, frame->height, 0, 0
-    });
+    sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex->texture, { frame->width, frame->height, 0, 0 });
   }
 
   frame_t hwframe;
 
-  std::unique_ptr<sws_t> sws;
-
   int width, height;
 
-  CUcontext cuda_ctx;
+  sws_t sws;
 };
 
-std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) {
+class cuda_ram_t : public cuda_t {
+public:
+  int convert(platf::img_t &img) override {
+    return sws.load_ram(img, tex.array) || sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], tex.texture);
+  }
+
+  int set_frame(AVFrame *frame) {
+    if(cuda_t::set_frame(frame)) {
+      return -1;
+    }
+
+    auto tex_opt = tex_t::make(height, width * 4);
+    if(!tex_opt) {
+      return -1;
+    }
+
+    tex = std::move(*tex_opt);
+
+    return 0;
+  }
+
+  tex_t tex;
+};
+
+class cuda_vram_t : public cuda_t {
+public:
+  int convert(platf::img_t &img) override {
+    return sws.convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1], (cudaTextureObject_t)img.data);
+  }
+};
+
+std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, bool vram) {
   if(init()) {
     return nullptr;
   }
 
-  auto cuda = std::make_shared<cuda_t>();
+  std::shared_ptr<cuda_t> cuda;
+
+  if(vram) {
+    cuda = std::make_shared<cuda_vram_t>();
+  }
+  else {
+    cuda = std::make_shared<cuda_ram_t>();
+  }
+
   if(cuda->init(width, height)) {
     return nullptr;
   }
 
   return cuda;
 }
-} // namespace cuda
 
-namespace platf::nvfbc {
+namespace nvfbc {
 static PNVFBCCREATEINSTANCE createInstance {};
 static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION };
 
+static constexpr inline NVFBC_BOOL nv_bool(bool b) {
+  return b ? NVFBC_TRUE : NVFBC_FALSE;
+}
+
 static void *handle { nullptr };
 int init() {
   static bool funcs_loaded = false;
@@ -418,49 +235,65 @@ int init() {
     return -1;
   }
 
+  auto status = cuda::nvfbc::createInstance(&cuda::nvfbc::func);
+  if(status) {
+    BOOST_LOG(error) << "Unable to create NvFBC instance"sv;
+
+    dlclose(handle);
+    handle = nullptr;
+    return -1;
+  }
+
   funcs_loaded = true;
   return 0;
 }
 
 class handle_t {
-  KITTY_USING_MOVE_T(session_t, NVFBC_SESSION_HANDLE, std::numeric_limits<std::uint64_t>::max(), {
-    if(el == std::numeric_limits<std::uint64_t>::max()) {
-      return;
-    }
-    NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
-
-    auto status = func.nvFBCDestroyHandle(el, &params);
-    if(status) {
-      BOOST_LOG(error) << "Failed to destroy nvfbc handle: "sv << func.nvFBCGetLastErrorStr(el);
-    }
-  });
+  enum flag_e {
+    SESSION_HANDLE,
+    SESSION_CAPTURE,
+    MAX_FLAGS,
+  };
 
 public:
+  handle_t() = default;
+  handle_t(handle_t &&other) : handle_flags { other.handle_flags }, handle { other.handle } {
+    other.handle_flags.reset();
+  }
+
+  handle_t &operator=(handle_t &&other) {
+    std::swap(handle_flags, other.handle_flags);
+    std::swap(handle, other.handle);
+
+    return *this;
+  }
+
   static std::optional<handle_t> make() {
     NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER };
-    session_t session;
 
-    auto status = func.nvFBCCreateHandle(&session.el, &params);
+    handle_t handle;
+    auto status = func.nvFBCCreateHandle(&handle.handle, &params);
     if(status) {
-      BOOST_LOG(error) << "Failed to create session: "sv << func.nvFBCGetLastErrorStr(session.el);
-      session.release();
+      BOOST_LOG(error) << "Failed to create session: "sv << handle.last_error();
 
       return std::nullopt;
     }
 
-    return handle_t { std::move(session) };
+    handle.handle_flags[SESSION_HANDLE] = true;
+
+    return std::move(handle);
   }
 
   const char *last_error() {
-    return func.nvFBCGetLastErrorStr(session.el);
+    return func.nvFBCGetLastErrorStr(handle);
   }
 
   std::optional<NVFBC_GET_STATUS_PARAMS> status() {
     NVFBC_GET_STATUS_PARAMS params { NVFBC_GET_STATUS_PARAMS_VER };
 
-    auto status = func.nvFBCGetStatus(session.el, &params);
+    auto status = func.nvFBCGetStatus(handle, &params);
     if(status) {
-      BOOST_LOG(error) << "Failed to create session: "sv << last_error();
+      BOOST_LOG(error) << "Failed to get NvFBC status: "sv << last_error();
 
       return std::nullopt;
     }
@@ -468,23 +301,328 @@ public:
     return params;
   }
 
-  session_t session;
+  int capture(NVFBC_CREATE_CAPTURE_SESSION_PARAMS &capture_params) {
+    if(func.nvFBCCreateCaptureSession(handle, &capture_params)) {
+      BOOST_LOG(error) << "Failed to start capture session: "sv << last_error();
+      return -1;
+    }
+
+    handle_flags[SESSION_CAPTURE] = true;
+
+    NVFBC_TOCUDA_SETUP_PARAMS setup_params {
+      NVFBC_TOCUDA_SETUP_PARAMS_VER,
+      NVFBC_BUFFER_FORMAT_BGRA,
+    };
+
+    if(func.nvFBCToCudaSetUp(handle, &setup_params)) {
+      BOOST_LOG(error) << "Failed to setup cuda interop with nvFBC: "sv << last_error();
+      return -1;
+    }
+    return 0;
+  }
+
+  int stop() {
+    if(!handle_flags[SESSION_CAPTURE]) {
+      return 0;
+    }
+
+    NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER };
+
+    if(func.nvFBCDestroyCaptureSession(handle, &params)) {
+      BOOST_LOG(error) << "Couldn't destroy capture session: "sv << last_error();
+
+      return -1;
+    }
+
+    handle_flags[SESSION_CAPTURE] = false;
+
+    return 0;
+  }
+
+  ~handle_t() {
+    if(!handle_flags[SESSION_HANDLE]) {
+      return;
+    }
+
+    if(handle_flags[SESSION_CAPTURE]) {
+      NVFBC_DESTROY_CAPTURE_SESSION_PARAMS params { NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER };
+
+      if(func.nvFBCDestroyCaptureSession(handle, &params)) {
+        BOOST_LOG(error) << "Couldn't destroy capture session: "sv << func.nvFBCGetLastErrorStr(handle);
+      }
+    }
+
+    NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
+
+    if(func.nvFBCDestroyHandle(handle, &params)) {
+      BOOST_LOG(error) << "Couldn't destroy session handle: "sv << func.nvFBCGetLastErrorStr(handle);
+    }
+  }
+
+  std::bitset<MAX_FLAGS> handle_flags;
+
+  NVFBC_SESSION_HANDLE handle;
 };
 
+class display_t : public platf::display_t {
+public:
+  int init(const std::string_view &display_name, int framerate) {
+    auto handle = handle_t::make();
+    if(!handle) {
+      return -1;
+    }
+
+    auto status_params = handle->status();
+    if(!status_params) {
+      return -1;
+    }
+
+    int streamedMonitor = -1;
+    if(!display_name.empty()) {
+      if(status_params->bXRandRAvailable) {
+        auto monitor_nr = util::from_view(display_name);
+
+        if(monitor_nr < 0 || monitor_nr >= status_params->dwOutputNum) {
+          BOOST_LOG(warning) << "Can't stream monitor ["sv << monitor_nr << "], it needs to be between [0] and ["sv << status_params->dwOutputNum - 1 << "], defaulting to virtual desktop"sv;
+        }
+        else {
+          streamedMonitor = monitor_nr;
+        }
+      }
+      else {
+        BOOST_LOG(warning) << "XrandR not available, streaming entire virtual desktop"sv;
+      }
+    }
+
+    capture_params = NVFBC_CREATE_CAPTURE_SESSION_PARAMS { NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER };
+
+    capture_params.eCaptureType                = NVFBC_CAPTURE_SHARED_CUDA;
+    capture_params.bDisableAutoModesetRecovery = nv_bool(true);
+
+    capture_params.dwSamplingRateMs = 1000 /* ms */ / framerate;
+
+    if(streamedMonitor != -1) {
+      auto &output = status_params->outputs[streamedMonitor];
+
+      width    = output.trackedBox.w;
+      height   = output.trackedBox.h;
+      offset_x = output.trackedBox.x;
+      offset_y = output.trackedBox.y;
+
+      capture_params.eTrackingType = NVFBC_TRACKING_OUTPUT;
+      capture_params.dwOutputId    = output.dwId;
+    }
+    else {
+      capture_params.eTrackingType = NVFBC_TRACKING_SCREEN;
+
+      width  = status_params->screenSize.w;
+      height = status_params->screenSize.h;
+    }
+
+    env_width  = status_params->screenSize.w;
+    env_height = status_params->screenSize.h;
+
+    this->handle = std::move(*handle);
+    return 0;
+  }
+
+  platf::capture_e capture(snapshot_cb_t &&snapshot_cb, std::shared_ptr<platf::img_t> img, bool *cursor) override {
+    // Force display_t::capture to initialize handle_t::capture
+    cursor_visible = !*cursor;
+
+    auto fg = util::fail_guard([&]() {
+      handle.stop();
+    });
+
+    while(img) {
+      auto status = snapshot(img.get(), 500ms, *cursor);
+      switch(status) {
+      case platf::capture_e::reinit:
+      case platf::capture_e::error:
+        return status;
+      case platf::capture_e::timeout:
+        std::this_thread::sleep_for(1ms);
+        continue;
+      case platf::capture_e::ok:
+        img = snapshot_cb(img);
+        break;
+      default:
+        BOOST_LOG(error) << "Unrecognized capture status ["sv << (int)status << ']';
+        return status;
+      }
+    }
+
+    return platf::capture_e::ok;
+  }
+
+  // Reinitialize the capture session.
+  platf::capture_e reinit(bool cursor) {
+    if(handle.stop()) {
+      return platf::capture_e::error;
+    }
+
+    cursor_visible = cursor;
+    if(false && cursor) {
+      capture_params.bPushModel          = nv_bool(false);
+      capture_params.bWithCursor         = nv_bool(true);
+      capture_params.bAllowDirectCapture = nv_bool(false);
+    }
+    else {
+      capture_params.bPushModel          = nv_bool(true);
+      capture_params.bWithCursor         = nv_bool(false);
+      capture_params.bAllowDirectCapture = nv_bool(true);
+    }
+
+    if(handle.capture(capture_params)) {
+      return platf::capture_e::error;
+    }
+
+    // If trying to capture directly, test if it actually does.
+    if(capture_params.bAllowDirectCapture) {
+      CUdeviceptr device_ptr;
+      NVFBC_FRAME_GRAB_INFO info;
+
+      NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab {
+        NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER,
+        NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY,
+        &device_ptr,
+        &info,
+        0,
+      };
+
+      // Direct Capture may fail the first few times, even if it's possible
+      for(int x = 0; x < 3; ++x) {
+        if(auto status = func.nvFBCToCudaGrabFrame(handle.handle, &grab)) {
+          if(status == NVFBC_ERR_MUST_RECREATE) {
+            return platf::capture_e::reinit;
+          }
+
+          BOOST_LOG(error) << "Couldn't capture nvFramebuffer: "sv << handle.last_error();
+
+          return platf::capture_e::error;
+        }
+
+        if(info.bDirectCapture) {
+          break;
+        }
+
+        BOOST_LOG(debug) << "Direct capture failed attempt ["sv << x << ']';
+      }
+
+      if(!info.bDirectCapture) {
+        BOOST_LOG(debug) << "Direct capture failed, trying the extra copy method"sv;
+        // Direct capture failed
+        capture_params.bPushModel          = nv_bool(false);
+        capture_params.bWithCursor         = nv_bool(false);
+        capture_params.bAllowDirectCapture = nv_bool(false);
+
+        if(handle.stop() || handle.capture(capture_params)) {
+          return platf::capture_e::error;
+        }
+      }
+    }
+
+    return platf::capture_e::ok;
+  }
+
+  platf::capture_e snapshot(platf::img_t *img, std::chrono::milliseconds timeout, bool cursor) {
+    if(cursor != cursor_visible) {
+      auto status = reinit(cursor);
+      if(status != platf::capture_e::ok) {
+        return status;
+      }
+    }
+
+    CUdeviceptr device_ptr;
+    NVFBC_FRAME_GRAB_INFO info;
+
+    NVFBC_TOCUDA_GRAB_FRAME_PARAMS grab {
+      NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER,
+      NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY,
+      &device_ptr,
+      &info,
+      (std::uint32_t)timeout.count(),
+    };
+
+    if(auto status = func.nvFBCToCudaGrabFrame(handle.handle, &grab)) {
+      if(status == NVFBC_ERR_MUST_RECREATE) {
+        return platf::capture_e::reinit;
+      }
+
+      BOOST_LOG(error) << "Couldn't capture nvFramebuffer: "sv << handle.last_error();
+      return platf::capture_e::error;
+    }
+
+    if(!info.bIsNewFrame) {
+      return platf::capture_e::timeout;
+    }
+
+    if(((img_t *)img)->tex.copy((std::uint8_t *)device_ptr, img->height, img->row_pitch)) {
+      return platf::capture_e::error;
+    }
+
+    return platf::capture_e::ok;
+  }
+
+  std::shared_ptr<platf::hwdevice_t> make_hwdevice(platf::pix_fmt_e pix_fmt) override {
+    return ::cuda::make_hwdevice(width, height, true);
+  }
+
+  std::shared_ptr<platf::img_t> alloc_img() override {
+    auto img = std::make_shared<cuda::img_t>();
+
+    img->width       = width;
+    img->height      = height;
+    img->pixel_pitch = 4;
+    img->row_pitch   = img->width * img->pixel_pitch;
+
+    auto tex_opt = tex_t::make(height, width * img->pixel_pitch);
+    if(!tex_opt) {
+      return nullptr;
+    }
+
+    img->tex  = std::move(*tex_opt);
+    img->data = (std::uint8_t *)img->tex.texture;
+
+    return img;
+  };
+
+  int dummy_img(platf::img_t *) override {
+    return 0;
+  }
+
+  bool cursor_visible;
+  handle_t handle;
+
+  NVFBC_CREATE_CAPTURE_SESSION_PARAMS capture_params;
+};
+} // namespace nvfbc
+} // namespace cuda
+
+namespace platf {
+std::shared_ptr<display_t> nvfbc_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) {
+  if(hwdevice_type != mem_type_e::cuda) {
+    BOOST_LOG(error) << "Could not initialize nvfbc display with the given hw device type"sv;
+    return nullptr;
+  }
+
+  auto display = std::make_shared<cuda::nvfbc::display_t>();
+
+  if(display->init(display_name, framerate)) {
+    return nullptr;
+  }
+
+  return display;
+}
+
 std::vector<std::string> nvfbc_display_names() {
-  if(init()) {
+  if(cuda::init() || cuda::nvfbc::init()) {
     return {};
   }
 
   std::vector<std::string> display_names;
 
-  auto status = createInstance(&func);
-  if(status) {
-    BOOST_LOG(error) << "Unable to create NvFBC instance"sv;
-    return {};
-  }
-
-  auto handle = handle_t::make();
+  auto handle = cuda::nvfbc::handle_t::make();
   if(!handle) {
     return {};
   }
@@ -500,7 +638,18 @@ std::vector<std::string> nvfbc_display_names() {
 
   BOOST_LOG(info) << "Found ["sv << status_params->dwOutputNum << "] outputs"sv;
   BOOST_LOG(info) << "Virtual Desktop: "sv << status_params->screenSize.w << 'x' << status_params->screenSize.h;
+  BOOST_LOG(info) << "XrandR: "sv << (status_params->bXRandRAvailable ? "available"sv : "unavailable"sv);
+
+  for(auto x = 0; x < status_params->dwOutputNum; ++x) {
+    auto &output = status_params->outputs[x];
+    BOOST_LOG(info) << "-- Output --"sv;
+    BOOST_LOG(debug) << "  ID: "sv << output.dwId;
+    BOOST_LOG(debug) << "  Name: "sv << output.name;
+    BOOST_LOG(info) << "  Resolution: "sv << output.trackedBox.w << 'x' << output.trackedBox.h;
+    BOOST_LOG(info) << "  Offset: "sv << output.trackedBox.x << 'x' << output.trackedBox.y;
+    display_names.emplace_back(std::to_string(x));
+  }
 
   return display_names;
 }
-} // namespace platf::nvfbc
\ No newline at end of file
+} // namespace platf
\ No newline at end of file
diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
index e310964c..68368095 100644
--- a/sunshine/platform/linux/cuda.cu
+++ b/sunshine/platform/linux/cuda.cu
@@ -21,6 +21,9 @@ using namespace std::literals;
 #define CU_CHECK_PTR(x, y) \
   if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr;
 
+#define CU_CHECK_OPT(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return std::nullopt;
+
 #define CU_CHECK_IGNORE(x, y) \
   check((x), SUNSHINE_STRINGVIEW(y ": "))
 
@@ -165,15 +168,21 @@ __global__ void RGBA_to_NV12(
   dstY[1]  = calcY(rgb_r, color_matrix) * 245.0f; // 245.0f is a magic number to ensure slight changes in luminosity are more visisble
 }
 
-sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix)
-    : array {}, texture { INVALID_TEXTURE }, threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } {
-  auto format = cudaCreateChannelDesc<uchar4>();
+int tex_t::copy(std::uint8_t *src, int height, int pitch) {
+  CU_CHECK(cudaMemcpy2DToArray(array, 0, 0, src, pitch, pitch, height, cudaMemcpyDeviceToDevice), "Couldn't copy to cuda array from deviceptr");
 
-  CU_CHECK_VOID(cudaMallocArray(&array, &format, pitch, in_height, cudaArrayDefault), "Couldn't allocate cuda array");
+  return 0;
+}
+
+std::optional<tex_t> tex_t::make(int height, int pitch) {
+  tex_t tex;
+
+  auto format = cudaCreateChannelDesc<uchar4>();
+  CU_CHECK_OPT(cudaMallocArray(&tex.array, &format, pitch, height, cudaArrayDefault), "Couldn't allocate cuda array");
 
   cudaResourceDesc res {};
   res.resType         = cudaResourceTypeArray;
-  res.res.array.array = array;
+  res.res.array.array = tex.array;
 
   cudaTextureDesc desc {};
 
@@ -183,9 +192,40 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit
 
   std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
 
-  CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture");
+  CU_CHECK_OPT(cudaCreateTextureObject(&tex.texture, &res, &desc, nullptr), "Couldn't create cuda texture");
 
+  return std::move(tex);
+}
 
+tex_t::tex_t() : array { }, texture { INVALID_TEXTURE } {}
+tex_t::tex_t(tex_t &&other) : array { other.array }, texture { other.texture } {
+  other.array = 0;
+  other.texture = INVALID_TEXTURE;
+}
+
+tex_t &tex_t::operator=(tex_t &&other) {
+  std::swap(array, other.array);
+  std::swap(texture, other.texture);
+
+  return *this;
+}
+
+tex_t::~tex_t() {
+  if(texture != INVALID_TEXTURE) {
+    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture");
+
+    texture = INVALID_TEXTURE;
+  }
+
+  if(array) {
+    CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array");
+
+    array = cudaArray_t {};
+  }
+}
+
+sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix)
+    : threadsPerBlock { threadsPerBlock }, color_matrix { std::move(color_matrix) } {
   // Ensure aspect ratio is maintained
   auto scalar       = std::fminf(out_width / (float)in_width, out_height / (float)in_height);
   auto out_width_f  = in_width * scalar;
@@ -202,52 +242,32 @@ sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int pit
   viewport.offsetY = offsetY_f;
 }
 
-sws_t::~sws_t() {
-  if(texture != INVALID_TEXTURE) {
-    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture");
-
-    texture = INVALID_TEXTURE;
-  }
-
-  if(array) {
-    CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array");
-
-    array = cudaArray_t {};
-  }
-}
-
-std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) {
+std::optional<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height, int pitch) {
   cudaDeviceProp props;
   int device;
-  CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device");
-  CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
+  CU_CHECK_OPT(cudaGetDevice(&device), "Couldn't get cuda device");
+  CU_CHECK_OPT(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
 
   auto ptr = make_ptr<video::color_t>();
   if(!ptr) {
-    return nullptr;
+    return std::nullopt;
   }
 
-  auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr));
-
-  if(sws->texture == INVALID_TEXTURE) {
-    return nullptr;
-  }
-
-  return sws;
+  return std::make_optional<sws_t>(in_width, in_height, out_width, out_height, pitch, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2, std::move(ptr));
 }
 
-int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) {
-  return convert(Y, UV, pitchY, pitchUV, viewport);
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture) {
+  return convert(Y, UV, pitchY, pitchUV, texture, viewport);
 }
 
-int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport) {
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport) {
   int threadsX = viewport.width / 2;
   int threadsY = viewport.height;
 
   dim3 block(threadsPerBlock, threadsPerBlock);
   dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock));
 
-  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t*)color_matrix.get());
+  RGBA_to_NV12<<<block, grid>>>(texture, Y, UV, pitchY, pitchUV, viewport, (video::color_t *)color_matrix.get());
 
   return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
 }
@@ -274,7 +294,7 @@ void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range)
   CU_CHECK_IGNORE(cudaMemcpy(color_matrix.get(), color_p, sizeof(video::color_t), cudaMemcpyHostToDevice), "Couldn't copy color matrix to cuda");
 }
 
-int sws_t::load_ram(platf::img_t &img) {
+int sws_t::load_ram(platf::img_t &img, cudaArray_t array) {
   return CU_CHECK_IGNORE(cudaMemcpy2DToArray(array, 0, 0, img.data, img.row_pitch, img.width * img.pixel_pitch, img.height, cudaMemcpyHostToDevice), "Couldn't copy to cuda array");
 }
 
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index 27dede07..08cb0156 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -1,15 +1,16 @@
 #ifndef SUNSHINE_PLATFORM_CUDA_H
 #define SUNSHINE_PLATFORM_CUDA_H
 
-#include <memory>
-
 #ifndef __NVCC__
 
 #include "sunshine/platform/common.h"
 #include "x11grab.h"
 
 namespace cuda {
-std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay);
+namespace nvfbc {
+std::vector<std::string> display_names();
+}
+std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, bool vram);
 int init();
 } // namespace cuda
 
@@ -41,9 +42,26 @@ struct viewport_t {
   int offsetX, offsetY;
 };
 
+class tex_t {
+public:
+  static std::optional<tex_t> make(int height, int pitch);
+
+  tex_t();
+  tex_t(tex_t &&);
+
+  tex_t &operator=(tex_t &&other);
+
+  ~tex_t();
+
+  int copy(std::uint8_t *src, int height, int pitch);
+
+  cudaArray_t array;
+  cudaTextureObject_t texture;
+};
+
 class sws_t {
 public:
-  ~sws_t();
+  sws_t() = default;
   sws_t(int in_width, int in_height, int out_width, int out_height, int pitch, int threadsPerBlock, ptr_t &&color_matrix);
 
   /**
@@ -52,19 +70,17 @@ public:
    * 
    * pitch -- The size of a single row of pixels in bytes
    */
-  static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
+  static std::optional<sws_t> make(int in_width, int in_height, int out_width, int out_height, int pitch);
 
   // Converts loaded image into a CUDevicePtr
-  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV);
-  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, const viewport_t &viewport);
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture);
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV, cudaTextureObject_t texture, const viewport_t &viewport);
 
   void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
 
-  int load_ram(platf::img_t &img);
+  int load_ram(platf::img_t &img, cudaArray_t array);
 
   ptr_t color_matrix;
-  cudaArray_t array;
-  cudaTextureObject_t texture;
 
   int threadsPerBlock;
 
diff --git a/sunshine/platform/linux/misc.cpp b/sunshine/platform/linux/misc.cpp
index ad5afc5a..e10242ff 100644
--- a/sunshine/platform/linux/misc.cpp
+++ b/sunshine/platform/linux/misc.cpp
@@ -141,6 +141,9 @@ std::string get_mac_address(const std::string_view &address) {
 }
 
 enum class source_e {
+  // #ifdef SUNSHINE_BUILD_CUDA
+  NVFBC,
+// #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   WAYLAND,
 #endif
@@ -153,6 +156,15 @@ enum class source_e {
 };
 static source_e source;
 
+// #ifdef SUNSHINE_BUILD_CUDA
+std::vector<std::string> nvfbc_display_names();
+std::shared_ptr<display_t> nvfbc_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate);
+
+bool verify_nvfbc() {
+  return !nvfbc_display_names().empty();
+}
+// #endif
+
 #ifdef SUNSHINE_BUILD_WAYLAND
 std::vector<std::string> wl_display_names();
 std::shared_ptr<display_t> wl_display(mem_type_e hwdevice_type, const std::string &display_name, int framerate);
@@ -182,6 +194,10 @@ bool verify_x11() {
 
 std::vector<std::string> display_names() {
   switch(source) {
+  // #ifdef SUNSHINE_BUILD_CUDA
+  case source_e::NVFBC:
+    return nvfbc_display_names();
+    // #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   case source_e::WAYLAND:
     return wl_display_names();
@@ -201,6 +217,10 @@ std::vector<std::string> display_names() {
 
 std::shared_ptr<display_t> display(mem_type_e hwdevice_type, const std::string &display_name, int framerate) {
   switch(source) {
+    // #ifdef SUNSHINE_BUILD_CUDA
+  case source_e::NVFBC:
+    return nvfbc_display(hwdevice_type, display_name, framerate);
+    // #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   case source_e::WAYLAND:
     return wl_display(hwdevice_type, display_name, framerate);
@@ -229,7 +249,7 @@ std::unique_ptr<deinit_t> init() {
     window_system = window_system_e::WAYLAND;
   }
 #endif
-#ifdef SUNSHINE_BUILD_X11
+#if defined(SUNSHINE_BUILD_X11) // || defined(SUNSHINE_BUILD_CUDA)
   if(std::getenv("DISPLAY") && window_system != window_system_e::WAYLAND) {
     if(std::getenv("WAYLAND_DISPLAY")) {
       BOOST_LOG(warning) << "Wayland detected, yet sunshine will use X11 for screencasting, screencasting will only work on XWayland applications"sv;
@@ -238,6 +258,13 @@ std::unique_ptr<deinit_t> init() {
     window_system = window_system_e::X11;
   }
 #endif
+// #ifdef SUNSHINE_BUILD_CUDA
+  if(verify_nvfbc()) {
+    BOOST_LOG(info) << "Using nvFBC for screencasting"sv;
+    source = source_e::NVFBC;
+    goto found_source;
+  }
+// #endif
 #ifdef SUNSHINE_BUILD_WAYLAND
   if(verify_wl()) {
     BOOST_LOG(info) << "Using Wayland for screencasting"sv;
diff --git a/sunshine/platform/linux/x11grab.cpp b/sunshine/platform/linux/x11grab.cpp
index c8bc70f0..decf24aa 100644
--- a/sunshine/platform/linux/x11grab.cpp
+++ b/sunshine/platform/linux/x11grab.cpp
@@ -518,7 +518,7 @@ struct x11_attr_t : public display_t {
     }
 
     if(mem_type == mem_type_e::cuda) {
-      return cuda::make_hwdevice(width, height, xdisplay.get());
+      return cuda::make_hwdevice(width, height, false);
     }
 
     return std::make_shared<hwdevice_t>();
@@ -678,7 +678,7 @@ struct shm_attr_t : public x11_attr_t {
 
 std::shared_ptr<display_t> x11_display(platf::mem_type_e hwdevice_type, const std::string &display_name, int framerate) {
   if(hwdevice_type != platf::mem_type_e::system && hwdevice_type != platf::mem_type_e::vaapi && hwdevice_type != platf::mem_type_e::cuda) {
-    BOOST_LOG(error) << "Could not initialize display with the given hw device type."sv;
+    BOOST_LOG(error) << "Could not initialize x11 display with the given hw device type"sv;
     return nullptr;
   }
 
diff --git a/sunshine/video.cpp b/sunshine/video.cpp
index 3b143cbe..6faf82ee 100644
--- a/sunshine/video.cpp
+++ b/sunshine/video.cpp
@@ -1699,7 +1699,7 @@ util::Either<buffer_t, int> vaapi_make_hwdevice_ctx(platf::hwdevice_t *base) {
 util::Either<buffer_t, int> cuda_make_hwdevice_ctx(platf::hwdevice_t *base) {
   buffer_t hw_device_buf;
 
-  auto status = av_hwdevice_ctx_create(&hw_device_buf, AV_HWDEVICE_TYPE_CUDA, nullptr, nullptr, 0);
+  auto status = av_hwdevice_ctx_create(&hw_device_buf, AV_HWDEVICE_TYPE_CUDA, nullptr, nullptr, 1 /* AV_CUDA_USE_PRIMARY_CONTEXT */);
   if(status < 0) {
     char string[AV_ERROR_MAX_STRING_SIZE];
     BOOST_LOG(error) << "Failed to create a CUDA device: "sv << av_make_error_string(string, AV_ERROR_MAX_STRING_SIZE, status);