From fed329568ca44993d9cc0a8fe145e73a332fb111 Mon Sep 17 00:00:00 2001
From: loki-47-6F-64 <loki-47-6F-64@users.noreply.github.com>
Date: Sun, 19 Sep 2021 20:40:34 +0200
Subject: [PATCH] Use an actual cuda kernel to convert RGB to NV12

---
 CMakeLists.txt                   |   39 +-
 sunshine/platform/linux/cuda.cpp |  212 +++-
 sunshine/platform/linux/cuda.cu  |  248 ++++
 sunshine/platform/linux/cuda.h   |   46 +
 sunshine/video.cpp               |   26 +-
 third-party/nvfbc/NvFBC.h        | 2006 ++++++++++++++++++++++++++++++
 third-party/nvfbc/helper_math.h  | 1469 ++++++++++++++++++++++
 7 files changed, 4007 insertions(+), 39 deletions(-)
 create mode 100644 sunshine/platform/linux/cuda.cu
 create mode 100644 third-party/nvfbc/NvFBC.h
 create mode 100644 third-party/nvfbc/helper_math.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b04f96f..df0424d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,15 +4,6 @@ project(Sunshine)
 
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 
-add_subdirectory(third-party/Simple-Web-Server)
-
-set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries")
-set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc")
-set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc")
-set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc")
-add_subdirectory(third-party/miniupnp/miniupnpc)
-include_directories(third-party/miniupnp)
-
 if(WIN32)
 	# Ugly hack to compile with #include <qos2.h>
 	add_compile_definitions(
@@ -21,9 +12,20 @@ if(WIN32)
 		QOS_NON_ADAPTIVE_FLOW=2)
 endif()
 add_subdirectory(third-party/moonlight-common-c/enet)
+add_subdirectory(third-party/Simple-Web-Server)
+add_subdirectory(third-party/cbs)
+
+set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries")
+set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc")
+set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc")
+set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc")
+add_subdirectory(third-party/miniupnp/miniupnpc)
+include_directories(third-party/miniupnp)
 
 find_package(Threads REQUIRED)
 find_package(OpenSSL REQUIRED)
+set(Boost_USE_STATIC_LIBS ON)
+find_package(Boost COMPONENTS log filesystem REQUIRED)
 
 list(APPEND SUNSHINE_COMPILE_OPTIONS -fPIC -Wall -Wno-missing-braces -Wno-maybe-uninitialized -Wno-sign-compare)
 
@@ -106,6 +108,11 @@ else()
 	option(SUNSHINE_ENABLE_X11 "Enable X11 grab if available" ON)
 	option(SUNSHINE_ENABLE_WAYLAND "Enable building wayland specific code" ON)
 
+	if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  	set(CMAKE_CUDA_ARCHITECTURES 75)
+	endif()
+	enable_language(CUDA)
+
 	if(${SUNSHINE_ENABLE_X11})
 		find_package(X11)
 	else()
@@ -188,6 +195,7 @@ else()
 		sunshine/platform/linux/publish.cpp
 		sunshine/platform/linux/vaapi.h
 		sunshine/platform/linux/vaapi.cpp
+		sunshine/platform/linux/cuda.cu
 		sunshine/platform/linux/cuda.cpp
 		sunshine/platform/linux/cuda.h
 		sunshine/platform/linux/graphics.h
@@ -203,7 +211,8 @@ else()
 		third-party/glad/include/EGL/eglplatform.h
 		third-party/glad/include/KHR/khrplatform.h
 		third-party/glad/include/glad/gl.h
-		third-party/glad/include/glad/egl.h)
+		third-party/glad/include/glad/egl.h
+		third-party/nvfbc/NvFBC.h)
 		
 	list(APPEND PLATFORM_LIBRARIES
 		dl
@@ -215,7 +224,8 @@ else()
 	include_directories(
 		/usr/include/libevdev-1.0
 		third-party/nv-codec-headers/include
-		third-party/glad/include)
+		third-party/glad/include
+		third-party/nvfbc)
 
 	if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH)
 		set(SUNSHINE_EXECUTABLE_PATH "sunshine")
@@ -224,11 +234,6 @@ else()
 	configure_file(sunshine.service.in sunshine.service @ONLY)
 endif()
 
-add_subdirectory(third-party/cbs)
-
-set(Boost_USE_STATIC_LIBS ON)
-find_package(Boost COMPONENTS log filesystem REQUIRED)
-
 set(SUNSHINE_TARGET_FILES
 	third-party/moonlight-common-c/reedsolomon/rs.c
 	third-party/moonlight-common-c/reedsolomon/rs.h
@@ -290,7 +295,7 @@ include_directories(
 
 string(TOUPPER "x${CMAKE_BUILD_TYPE}" BUILD_TYPE)
 if("${BUILD_TYPE}" STREQUAL "XDEBUG")
-	list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -pedantic -ggdb3)
+	list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -ggdb3)
 	if(WIN32)
 		set_source_files_properties(sunshine/nvhttp.cpp PROPERTIES COMPILE_FLAGS -O2)
 	endif()
diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp
index cdcaba57..811293d6 100644
--- a/sunshine/platform/linux/cuda.cpp
+++ b/sunshine/platform/linux/cuda.cpp
@@ -1,9 +1,4 @@
-#include "cuda.h"
-#include "graphics.h"
-#include "sunshine/main.h"
-#include "sunshine/utility.h"
-#include "wayland.h"
-#include "x11grab.h"
+#include <NvFBC.h>
 #include <ffnvcodec/dynlink_loader.h>
 
 extern "C" {
@@ -12,6 +7,13 @@ extern "C" {
 #include <libavutil/imgutils.h>
 }
 
+#include "cuda.h"
+#include "graphics.h"
+#include "sunshine/main.h"
+#include "sunshine/utility.h"
+#include "wayland.h"
+#include "x11grab.h"
+
 #define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
 #define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
 
@@ -23,6 +25,13 @@ extern "C" {
 
 using namespace std::literals;
 namespace cuda {
+constexpr auto cudaDevAttrMaxThreadsPerBlock          = (CUdevice_attribute)1;
+constexpr auto cudaDevAttrMaxThreadsPerMultiProcessor = (CUdevice_attribute)39;
+
+void pass_error(const std::string_view &sv, const char *name, const char *description) {
+  BOOST_LOG(error) << sv << name << ':' << description;
+}
+
 void cff(CudaFunctions *cf) {
   cuda_free_functions(&cf);
 }
@@ -151,7 +160,7 @@ int init() {
   return 0;
 }
 
-class cuda_t : public platf::hwdevice_t {
+class opengl_t : public platf::hwdevice_t {
 public:
   int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) {
     if(!cdf) {
@@ -273,16 +282,203 @@ public:
   int width, height;
 };
 
+class cuda_t : public platf::hwdevice_t {
+public:
+  ~cuda_t() override {
+    // sws_t needs to be destroyed while the context is active
+    if(sws) {
+      ctx_t ctx { cuda_ctx };
+
+      sws.reset();
+    }
+  }
+
+  int init(int in_width, int in_height) {
+    if(!cdf) {
+      BOOST_LOG(warning) << "cuda not initialized"sv;
+      return -1;
+    }
+
+    data = (void *)0x1;
+
+    width  = in_width;
+    height = in_height;
+
+    return 0;
+  }
+
+  int set_frame(AVFrame *frame) override {
+    this->hwframe.reset(frame);
+    this->frame = frame;
+
+    if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) {
+      BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv;
+      return -1;
+    }
+
+    if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) {
+      BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv;
+
+      return -1;
+    }
+
+    cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx;
+
+    ctx_t ctx { cuda_ctx };
+    sws = sws_t::make(width * 4, height, frame->width, frame->height);
+
+    if(!sws) {
+      return -1;
+    }
+
+    return 0;
+  }
+
+  int convert(platf::img_t &img) override {
+    ctx_t ctx { cuda_ctx };
+
+    return sws->load_ram(img) || sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1]);
+  }
+
+  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override {
+    ctx_t ctx { cuda_ctx };
+    sws->set_colorspace(colorspace, color_range);
+  }
+
+  frame_t hwframe;
+
+  std::unique_ptr<sws_t> sws;
+
+  int width, height;
+
+  CUcontext cuda_ctx;
+};
+
 std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) {
   if(init()) {
     return nullptr;
   }
 
   auto cuda = std::make_shared<cuda_t>();
-  if(cuda->init(width, height, xdisplay)) {
+  if(cuda->init(width, height)) {
     return nullptr;
   }
 
   return cuda;
 }
 } // namespace cuda
+
+namespace platf::nvfbc {
+static PNVFBCCREATEINSTANCE createInstance {};
+static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION };
+
+static void *handle { nullptr };
+int init() {
+  static bool funcs_loaded = false;
+
+  if(funcs_loaded) return 0;
+
+  if(!handle) {
+    handle = dyn::handle({ "libnvidia-fbc.so.1", "libnvidia-fbc.so" });
+    if(!handle) {
+      return -1;
+    }
+  }
+
+  std::vector<std::tuple<dyn::apiproc *, const char *>> funcs {
+    { (dyn::apiproc *)&createInstance, "NvFBCCreateInstance" },
+  };
+
+  if(dyn::load(handle, funcs)) {
+    dlclose(handle);
+    handle = nullptr;
+
+    return -1;
+  }
+
+  funcs_loaded = true;
+  return 0;
+}
+
+class handle_t { // Wraps one NvFBC session handle (session_t) and the NvFBC calls made through it.
+  KITTY_USING_MOVE_T(session_t, NVFBC_SESSION_HANDLE, std::numeric_limits<std::uint64_t>::max(), {
+    if(el == std::numeric_limits<std::uint64_t>::max()) {
+      return;
+    }
+    NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER };
+
+    auto status = func.nvFBCDestroyHandle(el, &params);
+    if(status) {
+      BOOST_LOG(error) << "Failed to destroy nvfbc handle: "sv << func.nvFBCGetLastErrorStr(el);
+    }
+  }); // the uint64 max value marks "no handle"; it is never passed to nvFBCDestroyHandle
+
+public:
+  static std::optional<handle_t> make() { // Creates a handle; logs and returns std::nullopt on failure.
+    NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER };
+    session_t session;
+
+    auto status = func.nvFBCCreateHandle(&session.el, &params);
+    if(status) {
+      BOOST_LOG(error) << "Failed to create session: "sv << func.nvFBCGetLastErrorStr(session.el);
+      session.release(); // presumably drops ownership so the failed handle is not destroyed -- confirm against KITTY_USING_MOVE_T
+
+      return std::nullopt;
+    }
+
+    return handle_t { std::move(session) };
+  }
+
+  const char *last_error() { // Error string NvFBC reports for this handle.
+    return func.nvFBCGetLastErrorStr(session.el);
+  }
+
+  std::optional<NVFBC_GET_STATUS_PARAMS> status() { // Calls nvFBCGetStatus; logs and returns std::nullopt on failure.
+    NVFBC_GET_STATUS_PARAMS params { NVFBC_GET_STATUS_PARAMS_VER };
+
+    auto status = func.nvFBCGetStatus(session.el, &params);
+    if(status) {
+      BOOST_LOG(error) << "Failed to get NvFBC status: "sv << last_error();
+
+      return std::nullopt;
+    }
+
+    return params;
+  }
+
+  session_t session;
+};
+
+std::vector<std::string> nvfbc_display_names() {
+  if(init()) {
+    return {};
+  }
+
+  std::vector<std::string> display_names;
+
+  auto status = createInstance(&func);
+  if(status) {
+    BOOST_LOG(error) << "Unable to create NvFBC instance"sv;
+    return {};
+  }
+
+  auto handle = handle_t::make();
+  if(!handle) {
+    return {};
+  }
+
+  auto status_params = handle->status();
+  if(!status_params) {
+    return {};
+  }
+
+  if(!status_params->bIsCapturePossible) {
+    BOOST_LOG(error) << "NVidia driver doesn't support NvFBC screencasting"sv;
+  }
+
+  BOOST_LOG(info) << "Found ["sv << status_params->dwOutputNum << "] outputs"sv;
+  BOOST_LOG(info) << "Virtual Desktop: "sv << status_params->screenSize.w << 'x' << status_params->screenSize.h;
+
+  return display_names;
+}
+} // namespace platf::nvfbc
\ No newline at end of file
diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu
new file mode 100644
index 00000000..a2ca6508
--- /dev/null
+++ b/sunshine/platform/linux/cuda.cu
@@ -0,0 +1,248 @@
+// #include <algorithm>
+#include <helper_math.h>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string_view>
+
+#include "cuda.h"
+
+using namespace std::literals;
+
+#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv
+#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x)
+
+#define CU_CHECK(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1
+
+#define CU_CHECK_VOID(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return;
+
+#define CU_CHECK_PTR(x, y) \
+  if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr;
+
+#define CU_CHECK_IGNORE(x, y) \
+  check((x), SUNSHINE_STRINGVIEW(y ": "))
+
+using namespace std::literals;
+
+//////////////////// Special declarations
+/**
+ * NVCC segfaults when including <chrono>
+ * Therefore, some declarations need to be added explicitly
+ */
+namespace platf {
+struct img_t {
+public:
+  std::uint8_t *data {};
+  std::int32_t width {};
+  std::int32_t height {};
+  std::int32_t pixel_pitch {};
+  std::int32_t row_pitch {};
+
+  virtual ~img_t() = default;
+};
+} // namespace platf
+
+namespace video {
+using __float4 = float[4];
+using __float3 = float[3];
+using __float2 = float[2];
+
+struct __attribute__((__aligned__(16))) color_t {
+  float4 color_vec_y;
+  float4 color_vec_u;
+  float4 color_vec_v;
+  float2 range_y;
+  float2 range_uv;
+};
+
+struct __attribute__((__aligned__(16))) color_extern_t {
+  __float4 color_vec_y;
+  __float4 color_vec_u;
+  __float4 color_vec_v;
+  __float2 range_y;
+  __float2 range_uv;
+};
+
+extern color_extern_t colors[4];
+} // namespace video
+
+//////////////////// End special declarations
+
+namespace cuda {
+auto constexpr INVALID_TEXTURE = std::numeric_limits<cudaTextureObject_t>::max();
+
+template<class T>
+inline T div_align(T l, T r) {
+  return (l + r - 1) / r;
+}
+
+void pass_error(const std::string_view &sv, const char *name, const char *description);
+inline static int check(cudaError_t result, const std::string_view &sv) {
+  if(result) {
+    auto name        = cudaGetErrorName(result);
+    auto description = cudaGetErrorString(result);
+
+    pass_error(sv, name, description);
+    return -1;
+  }
+
+  return 0;
+}
+
+__device__ __constant__ video::color_t color;
+
+
+inline __device__ float3 bgra_to_rgb(uchar4 vec) {
+  return make_float3((float)vec.z, (float)vec.y, (float)vec.x);
+}
+
+inline __device__ float2 calcUV(float3 pixel) {
+  float4 vec_u = color.color_vec_u;
+  float4 vec_v = color.color_vec_v;
+
+  float u = dot(pixel, make_float3(vec_u)) + vec_u.w;
+  float v = dot(pixel, make_float3(vec_v)) + vec_v.w;
+
+  u = u * color.range_uv.x + color.range_uv.y;
+  v = (v * color.range_uv.x + color.range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f;
+
+  return make_float2(u, v);
+}
+
+inline __device__ float calcY(float3 pixel) {
+  float4 vec_y = color.color_vec_y;
+
+  return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color.range_y.x + color.range_y.y;
+}
+
+__global__ void RGBA_to_NV12(
+  cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV,
+  std::uint32_t dstPitchY, std::uint32_t dstPitchUV,
+  std::uint32_t width, std::uint32_t height) {
+  // Converts the BGRA texture srcImage into a Y plane (dstY) and an interleaved U/V plane (dstUV).
+  int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2; // each thread handles two adjacent output columns
+  int idY = (threadIdx.y + blockDim.y * blockIdx.y);
+
+  if(idX >= width) return; // threads outside the width x height image do nothing
+  if(idY >= height) return;
+
+  dstY  = dstY + idX + idY * dstPitchY;
+  dstUV = dstUV + idX + (idY / 2 * dstPitchUV); // one U/V row per two image rows, so rows 2k and 2k+1 write the same U/V bytes
+
+  float x = (float)idX / (float)width / 4; // normalized coordinate; presumably the extra /4 matches the array being allocated 4x wider (width * 4) -- confirm
+  float y = (float)idY / (float)height;
+
+  float3 rgb_l = bgra_to_rgb(tex2D<uchar4>(srcImage, x, y));
+  float3 rgb_r = bgra_to_rgb(tex2D<uchar4>(srcImage, x + 0.25f / width, y + 1.0f / height)); // NOTE(review): the second sample is also offset by one row in y -- confirm this is intended
+
+  float2 uv = calcUV((rgb_l + rgb_r) * 0.5f); // chroma is computed from the average of the two samples
+
+  dstUV[0] = uv.x;
+  dstUV[1] = uv.y;
+  dstY[0]  = calcY(rgb_l);
+  dstY[1]  = calcY(rgb_r);
+}
+
+sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock)
+    : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock } {
+  auto format = cudaCreateChannelDesc<uchar4>();
+
+  CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array");
+
+  cudaResourceDesc res {};
+  res.resType         = cudaResourceTypeArray;
+  res.res.array.array = array;
+
+  cudaTextureDesc desc {};
+
+  desc.readMode         = cudaReadModeElementType;
+  desc.filterMode       = cudaFilterModePoint;
+  desc.normalizedCoords = true;
+
+  std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp);
+
+  CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture");
+}
+
+sws_t::~sws_t() {
+  if(texture != INVALID_TEXTURE) {
+    CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture");
+
+    texture = INVALID_TEXTURE;
+  }
+
+  if(array) {
+    CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array");
+
+    array = cudaArray_t {};
+  }
+}
+
+std::unique_ptr<sws_t> sws_t::make(int in_width, int in_height, int out_width, int out_height) {
+  cudaDeviceProp props;
+  int device;
+  CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device");
+  CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties");
+
+  auto sws = std::make_unique<sws_t>(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2);
+
+  if(sws->texture == INVALID_TEXTURE) {
+    return nullptr;
+  }
+
+  return sws;
+}
+
+int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) {
+  // Runs the RGBA_to_NV12 kernel on the loaded texture; returns 0 on success and -1 if the launch failed.
+  int threadsX = width / 2; // each kernel thread writes two horizontally adjacent pixels
+  // 1D blocks, one grid row per image row: a threadsPerBlock x threadsPerBlock block can exceed the per-block thread limit.
+  dim3 block(threadsPerBlock);
+  dim3 grid(div_align(threadsX, threadsPerBlock), height);
+  // The execution configuration takes the grid dimensions first, then the block dimensions.
+  RGBA_to_NV12<<<grid, block>>>(texture, Y, UV, pitchY, pitchUV, width, height);
+
+  return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed");
+}
+
+void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) {
+  color_range = 1; // NOTE(review): overwrites the caller's color_range, so the full-range branch below never runs -- looks like a debugging leftover, confirm
+  colorspace = 5; // NOTE(review): overwrites the caller's colorspace, so the switch below always selects video::colors[0] -- confirm
+  video::color_extern_t *color_p;
+  switch(colorspace) {
+  case 5: // SWS_CS_SMPTE170M
+    color_p = &video::colors[0];
+    break;
+  case 1: // SWS_CS_ITU709
+    color_p = &video::colors[2];
+    break;
+  case 9: // SWS_CS_BT2020
+  default:
+    color_p = &video::colors[0];
+  };
+
+  if(color_range > 1) {
+    // Full range
+    ++color_p;
+  }
+
+  auto color_matrix = *(video::color_t*)color_p; // local copy: the scaling below does not modify video::colors
+  color_matrix.color_vec_y.w *= 256.0f; // offsets are scaled by 256, presumably because the kernel works on 0-255 channel values -- confirm
+  color_matrix.color_vec_u.w *= 256.0f;
+  color_matrix.color_vec_v.w *= 256.0f;
+
+  color_matrix.range_y.y *= 256.0f;
+  color_matrix.range_uv.y *= 256.0f;
+
+  static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch");
+
+  CU_CHECK_IGNORE(cudaMemcpyToSymbol(color, &color_matrix, sizeof(video::color_t)), "Couldn't copy color matrix to cuda"); // uploads into the __constant__ symbol `color` read by calcY/calcUV
+}
+
+int sws_t::load_ram(platf::img_t &img) {
+  return CU_CHECK_IGNORE(cudaMemcpy2DToArray(array, 0, 0, img.data, img.row_pitch, img.width * img.pixel_pitch, img.height, cudaMemcpyHostToDevice), "Couldn't copy to cuda array");
+}
+
+} // namespace cuda
\ No newline at end of file
diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h
index a56c961b..41087506 100644
--- a/sunshine/platform/linux/cuda.h
+++ b/sunshine/platform/linux/cuda.h
@@ -1,6 +1,8 @@
 #ifndef SUNSHINE_PLATFORM_CUDA_H
 #define SUNSHINE_PLATFORM_CUDA_H
 
+#ifndef __NVCC__
+
 #include "sunshine/platform/common.h"
 #include "x11grab.h"
 
@@ -9,4 +11,48 @@ std::shared_ptr<platf::hwdevice_t> make_hwdevice(int width, int height, platf::x
 int init();
 } // namespace cuda
 
+#else
+namespace platf {
+class img_t;
+}
+#endif
+
+typedef struct cudaArray *cudaArray_t;
+
+#if !defined(__CUDACC__)
+typedef unsigned long long cudaTextureObject_t;
+#else  /* defined(__CUDACC__) */
+typedef __location__(device_builtin) unsigned long long cudaTextureObject_t;
+#endif /* !defined(__CUDACC__) */
+
+namespace cuda {
+class sws_t {
+public:
+  ~sws_t();
+  sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock);
+
+  /**
+   * in_width, in_height -- The width (in bytes) and height of the captured image
+   * out_width, out_height -- the width and height of the NV12 image in pixels
+   * 
+   * cuda_device -- pointer to the cuda device
+   */
+  static std::unique_ptr<sws_t> make(int in_width, int in_height, int out_width, int out_height);
+
+  // Converts loaded image into a CUDevicePtr
+  int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV);
+
+  void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range);
+
+  int load_ram(platf::img_t &img);
+
+  cudaArray_t array;
+  cudaTextureObject_t texture;
+
+  int width, height;
+
+  int threadsPerBlock;
+};
+} // namespace cuda
+
 #endif
\ No newline at end of file
diff --git a/sunshine/video.cpp b/sunshine/video.cpp
index 7205aa07..3b143cbe 100644
--- a/sunshine/video.cpp
+++ b/sunshine/video.cpp
@@ -324,7 +324,7 @@ struct encoder_t {
 class session_t {
 public:
   session_t() = default;
-  session_t(ctx_t &&ctx, util::wrap_ptr<platf::hwdevice_t> &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {}
+  session_t(ctx_t &&ctx, std::shared_ptr<platf::hwdevice_t> &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {}
 
   session_t(session_t &&other) noexcept = default;
 
@@ -342,7 +342,7 @@ public:
   }
 
   ctx_t ctx;
-  util::wrap_ptr<platf::hwdevice_t> device;
+  std::shared_ptr<platf::hwdevice_t> device;
 
   std::vector<packet_raw_t::replace_t> replacements;
 
@@ -369,7 +369,6 @@ struct sync_session_t {
   sync_session_ctx_t *ctx;
 
   platf::img_t *img_tmp;
-  std::shared_ptr<platf::hwdevice_t> hwdevice;
   session_t session;
 };
 
@@ -779,7 +778,7 @@ int encode(int64_t frame_nr, session_t &session, frame_t::pointer frame, safe::m
   return 0;
 }
 
-std::optional<session_t> make_session(const encoder_t &encoder, const config_t &config, int width, int height, platf::hwdevice_t *hwdevice) {
+std::optional<session_t> make_session(const encoder_t &encoder, const config_t &config, int width, int height, std::shared_ptr<platf::hwdevice_t> &&hwdevice) {
   bool hardware = encoder.dev_type != AV_HWDEVICE_TYPE_NONE;
 
   auto &video_format = config.videoFormat == 0 ? encoder.h264 : encoder.hevc;
@@ -886,7 +885,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
   if(hardware) {
     ctx->pix_fmt = encoder.dev_pix_fmt;
 
-    auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice);
+    auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice.get());
     if(buf_or_error.has_right()) {
       return std::nullopt;
     }
@@ -965,7 +964,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
     frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
   }
 
-  util::wrap_ptr<platf::hwdevice_t> device;
+  std::shared_ptr<platf::hwdevice_t> device;
 
   if(!hwdevice->data) {
     auto device_tmp = std::make_unique<swdevice_t>();
@@ -977,7 +976,7 @@ std::optional<session_t> make_session(const encoder_t &encoder, const config_t &
     device = std::move(device_tmp);
   }
   else {
-    device = hwdevice;
+    device = std::move(hwdevice);
   }
 
   if(device->set_frame(frame.release())) {
@@ -1009,12 +1008,12 @@ void encode_run(
   img_event_t images,
   config_t config,
   int width, int height,
-  platf::hwdevice_t *hwdevice,
+  std::shared_ptr<platf::hwdevice_t> &&hwdevice,
   safe::signal_t &reinit_event,
   const encoder_t &encoder,
   void *channel_data) {
 
-  auto session = make_session(encoder, config, width, height, hwdevice);
+  auto session = make_session(encoder, config, width, height, std::move(hwdevice));
   if(!session) {
     return;
   }
@@ -1101,12 +1100,11 @@ std::optional<sync_session_t> make_synced_session(platf::display_t *disp, const
   // absolute mouse coordinates require that the dimensions of the screen are known
   ctx.touch_port_events->raise(make_port(disp, ctx.config));
 
-  auto session = make_session(encoder, ctx.config, img.width, img.height, hwdevice.get());
+  auto session = make_session(encoder, ctx.config, img.width, img.height, std::move(hwdevice));
   if(!session) {
     return std::nullopt;
   }
 
-  encode_session.hwdevice = std::move(hwdevice);
   encode_session.session  = std::move(*session);
 
   return std::move(encode_session);
@@ -1208,7 +1206,7 @@ encode_e encode_run_sync(
           ctx->idr_events->pop();
         }
 
-        if(pos->hwdevice->convert(*img)) {
+        if(pos->session.device->convert(*img)) {
           BOOST_LOG(error) << "Could not convert image"sv;
           ctx->shutdown_event->raise(true);
 
@@ -1356,7 +1354,7 @@ void capture_async(
       frame_nr,
       mail, images,
       config, display->width, display->height,
-      hwdevice.get(),
+      std::move(hwdevice),
       ref->reinit_event, *ref->encoder_p,
       channel_data);
   }
@@ -1409,7 +1407,7 @@ int validate_config(std::shared_ptr<platf::display_t> &disp, const encoder_t &en
     return -1;
   }
 
-  auto session = make_session(encoder, config, disp->width, disp->height, hwdevice.get());
+  auto session = make_session(encoder, config, disp->width, disp->height, std::move(hwdevice));
   if(!session) {
     return -1;
   }
diff --git a/third-party/nvfbc/NvFBC.h b/third-party/nvfbc/NvFBC.h
new file mode 100644
index 00000000..8990eeab
--- /dev/null
+++ b/third-party/nvfbc/NvFBC.h
@@ -0,0 +1,2006 @@
+/*!
+ * \file
+ *
+ * This file contains the interface constants, structure definitions and
+ * function prototypes defining the NvFBC API for Linux.
+ *
+ * Copyright (c) 2013-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _NVFBC_H_
+#define _NVFBC_H_
+
+#include <stdint.h>
+
+/*!
+ * \mainpage NVIDIA Framebuffer Capture (NvFBC) for Linux.
+ *
+ * NvFBC is a high performance, low latency API to capture the framebuffer of
+ * an X server screen.
+ *
+ * The output from NvFBC captures everything that would be visible if we were
+ * directly looking at the monitor.  This includes window manager decoration,
+ * mouse cursor, overlay, etc.
+ *
+ * It is ideally suited to desktop or fullscreen application capture and
+ * remoting.
+ */
+
+/*!
+ * \defgroup FBC_REQ Requirements
+ *
+ * The following requirements are provided by the regular NVIDIA Display Driver
+ * package:
+ *
+ * - OpenGL core >= 4.2:
+ *   Required.  NvFBC relies on OpenGL to perform frame capture and
+ *   post-processing.
+ *
+ * - Vulkan 1.1:
+ *   Required.
+ *
+ * - libcuda.so.1 >= 5.5:
+ *   Optional. Used for capture to video memory with CUDA interop.
+ *
+ * The following requirements must be installed separately depending on the
+ * Linux distribution being used:
+ *
+ * - XRandR extension >= 1.2:
+ *   Optional.  Used for RandR output tracking.
+ *
+ * - libX11-xcb.so.1 >= 1.2:
+ *   Required.  NvFBC uses a mix of Xlib and XCB.  Xlib is needed to use GLX,
+ *   XCB is needed to make NvFBC more resilient against X server terminations
+ *   while a capture session is active.
+ *
+ * - libxcb.so.1 >= 1.3:
+ *   Required.  See above.
+ *
+ * - xorg-server >= 1.3:
+ *   Optional.  Required for push model to work properly.
+ *
+ * Note that all optional dependencies are dlopen()'d at runtime.  Failure to
+ * load an optional library is not fatal.
+ */
+
+/*!
+ * \defgroup FBC_CHANGES ChangeLog
+ *
+ * NvFBC Linux API version 0.1
+ * - Initial BETA release.
+ *
+ * NvFBC Linux API version 0.2
+ * - Added 'bEnableMSE' field to NVFBC_H264_HW_ENC_CONFIG.
+ * - Added 'dwMSE' field to NVFBC_TOH264_GRAB_FRAME_PARAMS.
+ * - Added 'bEnableAQ' field to NVFBC_H264_HW_ENC_CONFIG.
+ * - Added 'NVFBC_H264_PRESET_LOSSLESS_HP' enum to NVFBC_H264_PRESET.
+ * - Added 'NVFBC_BUFFER_FORMAT_YUV444P' enum to NVFBC_BUFFER_FORMAT.
+ * - Added 'eInputBufferFormat' field to NVFBC_H264_HW_ENC_CONFIG.
+ * - Added '0' and '244' values for NVFBC_H264_HW_ENC_CONFIG::dwProfile.
+ *
+ * NvFBC Linux API version 0.3
+ * - Improved multi-threaded support by implementing an API locking mechanism.
+ * - Added 'nvFBCBindContext' API entry point.
+ * - Added 'nvFBCReleaseContext' API entry point.
+ *
+ * NvFBC Linux API version 1.0
+ * - Added codec agnostic interface for HW encoding.
+ * - Deprecated H.264 interface.
+ * - Added support for H.265/HEVC HW encoding.
+ *
+ * NvFBC Linux API version 1.1
+ * - Added 'nvFBCToHwGetCaps' API entry point.
+ * - Added 'dwDiffMapScalingFactor' field to NVFBC_TOSYS_SETUP_PARAMS.
+ *
+ * NvFBC Linux API version 1.2
+ * - Deprecated ToHwEnc interface.
+ * - Added ToGL interface that captures frames to an OpenGL texture in video
+ *   memory.
+ * - Added 'bDisableAutoModesetRecovery' field to
+ *   NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ * - Added 'bExternallyManagedContext' field to NVFBC_CREATE_HANDLE_PARAMS.
+ *
+ * NvFBC Linux API version 1.3
+ * - Added NVFBC_BUFFER_FORMAT_RGBA
+ * - Added 'dwTimeoutMs' field to NVFBC_TOSYS_GRAB_FRAME_PARAMS,
+ *   NVFBC_TOCUDA_GRAB_FRAME_PARAMS, and NVFBC_TOGL_GRAB_FRAME_PARAMS.
+ *
+ * NvFBC Linux API version 1.4
+ * - Clarified that NVFBC_BUFFER_FORMAT_{ARGB,RGB,RGBA} are byte-order formats.
+ * - Renamed NVFBC_BUFFER_FORMAT_YUV420P to NVFBC_BUFFER_FORMAT_NV12.
+ * - Added new requirements.
+ * - Made NvFBC more resilient against the X server terminating during an active
+ *   capture session.  See new comments for ::NVFBC_ERR_X.
+ * - Relaxed requirement that 'frameSize' must have a width being a multiple of
+ *   4 and a height being a multiple of 2.
+ * - Added 'bRoundFrameSize' field to NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ * - Relaxed requirement that the scaling factor for differential maps must be
+ *   a multiple of the size of the frame.
+ * - Added 'diffMapSize' field to NVFBC_TOSYS_SETUP_PARAMS and
+ *   NVFBC_TOGL_SETUP_PARAMS.
+ *
+ * NvFBC Linux API version 1.5
+ * - Added NVFBC_BUFFER_FORMAT_BGRA
+ *
+ * NvFBC Linux API version 1.6
+ * - Added the 'NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY',
+ *   'NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY', and
+ *   'NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY' capture flags.
+ * - Exposed debug and performance logs through the NVFBC_LOG_LEVEL environment
+ *   variable.  Setting it to "1" enables performance logs, setting it to "2"
+ *   enables debugging logs, setting it to "3" enables both.
+ * - Logs are printed to stdout or to the file pointed by the NVFBC_LOG_FILE
+ *   environment variable.
+ * - Added 'ulTimestampUs' to NVFBC_FRAME_GRAB_INFO.
+ * - Added 'dwSamplingRateMs' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ * - Added 'bPushModel' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ *
+ * NvFBC Linux API version 1.7
+ * - Retired the NVFBC_CAPTURE_TO_HW_ENCODER interface.
+ *   This interface has been deprecated since NvFBC 1.2 and has received no
+ *   updates or new features since. We recommend using the NVIDIA Video Codec
+ *   SDK to encode NvFBC frames.
+ *   See: https://developer.nvidia.com/nvidia-video-codec-sdk
+ * - Added a 'Capture Modes' section to those headers.
+ * - Added a 'Post Processing' section to those headers.
+ * - Added an 'Environment Variables' section to those headers.
+ * - Added 'bInModeset' to NVFBC_GET_STATUS_PARAMS.
+ * - Added 'bAllowDirectCapture' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS.
+ * - Added 'bDirectCapture' to NVFBC_FRAME_GRAB_INFO.
+ * - Added 'bRequiredPostProcessing' to NVFBC_FRAME_GRAB_INFO.
+ */
+
+/*!
+ * \defgroup FBC_MODES Capture Modes
+ *
+ * When creating a capture session, NvFBC instantiates a capture subsystem
+ * living in the NVIDIA X driver.
+ *
+ * This subsystem listens for damage events coming from applications then
+ * generates (composites) frames for NvFBC when new content is available.
+ *
+ * This capture server can operate on a timer where it periodically checks if
+ * there are any pending damage events, or it can generate frames as soon as it
+ * receives a new damage event.
+ * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::dwSamplingRateMs,
+ * and NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bPushModel.
+ *
+ * NvFBC can also attach itself to a fullscreen unoccluded application and have
+ * it copy its frames directly into a buffer owned by NvFBC upon present. This
+ * mode bypasses the X server.
+ * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture.
+ *
+ * NvFBC is designed to capture frames with as few copies as possible. The
+ * NVIDIA X driver composites frames directly into the NvFBC buffers, and
+ * direct capture copies frames directly into these buffers as well.
+ *
+ * Depending on the configuration of a capture session, an extra copy (rendering
+ * pass) may be needed. See the 'Post Processing' section.
+ */
+
+/*!
+ * \defgroup FBC_PP Post Processing
+ *
+ * Depending on the configuration of a capture session, NvFBC might need to
+ * do post processing on frames.
+ *
+ * Post processing is required for the following reasons:
+ * - NvFBC needs to do a pixel format conversion.
+ * - Diffmaps are requested.
+ * - Capture to system memory is requested.
+ *
+ * NvFBC needs to do a conversion if the requested pixel format does not match
+ * the native format. The native format is NVFBC_BUFFER_FORMAT_BGRA.
+ *
+ * Note: post processing is *not* required for frame scaling and frame cropping.
+ *
+ * Skipping post processing can reduce capture latency. An application can know
+ * whether post processing was required by checking
+ * NVFBC_FRAME_GRAB_INFO::bRequiredPostProcessing.
+ */
+
+/*!
+ * \defgroup FBC_ENVVAR Environment Variables
+ *
+ * Below are the environment variables supported by NvFBC:
+ *
+ * - NVFBC_LOG_LEVEL
+ *   Bitfield where the first bit enables debug logs and the second bit enables
+ *   performance logs. Both can be enabled by setting this envvar to 3.
+ *
+ * - NVFBC_LOG_FILE
+ *   Write all NvFBC logs to the given file.
+ *
+ * - NVFBC_FORCE_ALLOW_DIRECT_CAPTURE
+ *   Used to override NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture.
+ *
+ * - NVFBC_FORCE_POST_PROCESSING
+ *   Used to force the post processing step, even if it could be skipped.
+ *   See the 'Post Processing' section.
+ */
+
+/*!
+ * \defgroup FBC_STRUCT Structure Definition
+ *
+ * @{
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * Calling convention.
+ */
+#define NVFBCAPI
+
+/*!
+ * NvFBC API major version.
+ */
+#define NVFBC_VERSION_MAJOR 1
+
+/*!
+ * NvFBC API minor version.
+ */
+#define NVFBC_VERSION_MINOR 7
+
+/*!
+ * NvFBC API version.
+ */
+#define NVFBC_VERSION (uint32_t) (NVFBC_VERSION_MINOR | (NVFBC_VERSION_MAJOR << 8))
+
+/*!
+ * Creates a version number for structure parameters.
+ */
+#define NVFBC_STRUCT_VERSION(typeName, ver) \
+    (uint32_t) (sizeof(typeName) | ((ver) << 16) | (NVFBC_VERSION << 24))
+
+/*!
+ * Defines error codes.
+ *
+ * \see NvFBCGetLastErrorStr
+ */
+typedef enum _NVFBCSTATUS
+{
+    /*!
+     * This indicates that the API call returned with no errors.
+     */
+    NVFBC_SUCCESS             = 0,
+    /*!
+     * This indicates that the API version between the client and the library
+     * is not compatible.
+     */
+    NVFBC_ERR_API_VERSION     = 1,
+    /*!
+     * An internal error occurred.
+     */
+    NVFBC_ERR_INTERNAL        = 2,
+    /*!
+     * This indicates that one or more of the parameters passed to the API call
+     * is invalid.
+     */
+    NVFBC_ERR_INVALID_PARAM   = 3,
+    /*!
+     * This indicates that one or more of the pointers passed to the API call
+     * is invalid.
+     */
+    NVFBC_ERR_INVALID_PTR     = 4,
+    /*!
+     * This indicates that the handle passed to the API call to identify the
+     * client is invalid.
+     */
+    NVFBC_ERR_INVALID_HANDLE  = 5,
+    /*!
+     * This indicates that the maximum number of threaded clients of the same
+     * process has been reached.  The limit is 10 threads per process.
+     * There is no limit on the number of processes.
+     */
+    NVFBC_ERR_MAX_CLIENTS     = 6,
+    /*!
+     * This indicates that the requested feature is not currently supported
+     * by the library.
+     */
+    NVFBC_ERR_UNSUPPORTED     = 7,
+    /*!
+     * This indicates that the API call failed because it was unable to allocate
+     * enough memory to perform the requested operation.
+     */
+    NVFBC_ERR_OUT_OF_MEMORY   = 8,
+    /*!
+     * This indicates that the API call was not expected.  This happens when
+     * API calls are performed in a wrong order, such as trying to capture
+     * a frame prior to creating a new capture session; or trying to set up
+     * a capture to video memory although a capture session to system memory
+     * was created.
+     */
+    NVFBC_ERR_BAD_REQUEST     = 9,
+    /*!
+     * This indicates an X error, most likely meaning that the X server has
+     * been terminated.  When this error is returned, the only resort is to
+     * create another FBC handle using NvFBCCreateHandle().
+     *
+     * The previous handle should still be freed with NvFBCDestroyHandle(), but
+     * it might leak resources, in particular X, GLX, and GL resources since
+     * it is no longer possible to communicate with an X server to free them
+     * through the driver.
+     *
+     * The best course of action to eliminate this potential leak is to close
+     * the OpenGL driver, close the forked process running the capture, or
+     * restart the application.
+     */
+    NVFBC_ERR_X               = 10,
+    /*!
+     * This indicates a GLX error.
+     */
+    NVFBC_ERR_GLX             = 11,
+    /*!
+     * This indicates an OpenGL error.
+     */
+    NVFBC_ERR_GL              = 12,
+    /*!
+     * This indicates a CUDA error.
+     */
+    NVFBC_ERR_CUDA            = 13,
+    /*!
+     * This indicates a HW encoder error.
+     */
+    NVFBC_ERR_ENCODER         = 14,
+    /*!
+     * This indicates an NvFBC context error.
+     */
+    NVFBC_ERR_CONTEXT         = 15,
+    /*!
+     * This indicates that the application must recreate the capture session.
+     *
+     * This error can be returned if a modeset event occurred while capturing
+     * frames, and NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bDisableAutoModesetRecovery
+     * was set to NVFBC_TRUE.
+     */
+    NVFBC_ERR_MUST_RECREATE   = 16,
+    /*!
+     * This indicates a Vulkan error.
+     */
+    NVFBC_ERR_VULKAN          = 17,
+} NVFBCSTATUS;
+
+/*!
+ * Defines boolean values.
+ */
+typedef enum _NVFBC_BOOL
+{
+    /*!
+     * False value.
+     */
+    NVFBC_FALSE = 0,
+    /*!
+     * True value.
+     */
+    NVFBC_TRUE,
+} NVFBC_BOOL;
+
+/*!
+ * Maximum size in bytes of an error string.
+ */
+#define NVFBC_ERR_STR_LEN 512
+
+/*!
+ * Capture type.
+ */
+typedef enum _NVFBC_CAPTURE_TYPE
+{
+    /*!
+     * Capture frames to a buffer in system memory.
+     */
+    NVFBC_CAPTURE_TO_SYS = 0,
+    /*!
+     * Capture frames to a CUDA device in video memory.
+     *
+     * Specifying this will dlopen() libcuda.so.1 and fail if not available.
+     */
+    NVFBC_CAPTURE_SHARED_CUDA,
+    /*!
+     * Retired. Do not use.
+     */
+    /* NVFBC_CAPTURE_TO_HW_ENCODER, */
+    /*!
+     * Capture frames to an OpenGL buffer in video memory.
+     */
+    NVFBC_CAPTURE_TO_GL = 3,
+} NVFBC_CAPTURE_TYPE;
+
+/*!
+ * Tracking type.
+ *
+ * NvFBC can track a specific region of the framebuffer to capture.
+ *
+ * An X screen corresponds to the entire framebuffer.
+ *
+ * An RandR CRTC is a component of the GPU that reads pixels from a region of
+ * the X screen and sends them through a pipeline to an RandR output.
+ * A physical monitor can be connected to an RandR output.  Tracking an RandR
+ * output captures the region of the X screen that the RandR CRTC is sending to
+ * the RandR output.
+ */
+typedef enum
+{
+    /*!
+     * By default, NvFBC tries to track a connected primary output.  If none is
+     * found, then it tries to track the first connected output.  If none is
+     * found then it tracks the entire X screen.
+     *
+     * If the XRandR extension is not available, this option has the same effect
+     * as ::NVFBC_TRACKING_SCREEN.
+     *
+     * This default behavior might be subject to changes in the future.
+     */
+    NVFBC_TRACKING_DEFAULT = 0,
+    /*!
+     * Track an RandR output specified by its ID in the appropriate field.
+     *
+     * The list of connected outputs can be queried via NvFBCGetStatus().
+     * This list can also be obtained using e.g., xrandr(1).
+     *
+     * If the XRandR extension is not available, setting this option returns an
+     * error.
+     */
+    NVFBC_TRACKING_OUTPUT,
+    /*!
+     * Track the entire X screen.
+     */
+    NVFBC_TRACKING_SCREEN,
+} NVFBC_TRACKING_TYPE;
+
+/*!
+ * Buffer format.
+ */
+typedef enum _NVFBC_BUFFER_FORMAT
+{
+    /*!
+     * Data will be converted to ARGB8888 byte-order format. 32 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_ARGB = 0,
+    /*!
+     * Data will be converted to RGB888 byte-order format. 24 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_RGB,
+    /*!
+     * Data will be converted to NV12 format using HDTV weights
+     * according to ITU-R BT.709.  12 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_NV12,
+    /*!
+     * Data will be converted to YUV 444 planar format using HDTV weights
+     * according to ITU-R BT.709.  24 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_YUV444P,
+    /*!
+     * Data will be converted to RGBA8888 byte-order format. 32 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_RGBA,
+    /*!
+     * Native format. No pixel conversion needed.
+     * BGRA8888 byte-order format. 32 bpp.
+     */
+    NVFBC_BUFFER_FORMAT_BGRA,
+} NVFBC_BUFFER_FORMAT;
+
+#define NVFBC_BUFFER_FORMAT_YUV420P NVFBC_BUFFER_FORMAT_NV12
+
+/*!
+ * Handle used to identify an NvFBC session.
+ */
+typedef uint64_t NVFBC_SESSION_HANDLE;
+
+/*!
+ * Box used to describe an area of the tracked region to capture.
+ *
+ * The coordinates are relative to the tracked region.
+ *
+ * E.g., if the size of the X screen is 3520x1200 and the tracked RandR output
+ * scans a region of 1600x1200+1920+0, then setting a capture box of
+ * 800x600+100+50 effectively captures a region of 800x600+2020+50 relative to
+ * the X screen.
+ */
+typedef struct _NVFBC_BOX
+{
+    /*!
+     * [in] X offset of the box.
+     */
+    uint32_t x;
+    /*!
+     * [in] Y offset of the box.
+     */
+    uint32_t y;
+    /*!
+     * [in] Width of the box.
+     */
+    uint32_t w;
+    /*!
+     * [in] Height of the box.
+     */
+    uint32_t h;
+} NVFBC_BOX;
+
+/*!
+ * Size used to describe the size of a frame.
+ */
+typedef struct _NVFBC_SIZE
+{
+    /*!
+     * [in] Width.
+     */
+    uint32_t w;
+    /*!
+     * [in] Height.
+     */
+    uint32_t h;
+} NVFBC_SIZE;
+
+/*!
+ * Describes information about a captured frame.
+ */
+typedef struct _NVFBC_FRAME_GRAB_INFO
+{
+    /*!
+     * [out] Width of the captured frame.
+     */
+    uint32_t dwWidth;
+    /*!
+     * [out] Height of the captured frame.
+     */
+    uint32_t dwHeight;
+    /*!
+     * [out] Size of the frame in bytes.
+     */
+    uint32_t dwByteSize;
+    /*!
+     * [out] Incremental ID of the current frame.
+     *
+     * This can be used to identify a frame.
+     */
+    uint32_t dwCurrentFrame;
+    /*!
+     * [out] Whether the captured frame is a new frame.
+     *
+     * When using non blocking calls it is possible to capture a frame
+     * that was already captured before if the display server did not
+     * render a new frame in the meantime.  In that case, this flag
+     * will be set to NVFBC_FALSE.
+     *
+     * When using blocking calls each captured frame will have
+     * this flag set to NVFBC_TRUE since the blocking mechanism waits for
+     * the display server to render a new frame.
+     *
+     * Note that this flag does not guarantee that the content of
+     * the frame will be different compared to the previous captured frame.
+     *
+     * In particular, some compositing managers report the entire
+     * framebuffer as damaged when an application refreshes its content.
+     *
+     * Consider a single X screen spanned across physical displays A and B
+     * and an NvFBC application tracking display A.  Depending on the
+     * compositing manager, it is possible that an application refreshing
+     * itself on display B will trigger a frame capture on display A.
+     *
+     * Workarounds include:
+     * - Using separate X screens
+     * - Disabling the composite extension
+     * - Using a compositing manager that properly reports what regions
+     *   are damaged
+     * - Using NvFBC's diffmaps to find out if the frame changed
+     */
+    NVFBC_BOOL bIsNewFrame;
+    /*!
+     * [out] Frame timestamp
+     *
+     * Time in micro seconds when the display server started rendering the
+     * frame.
+     *
+     * This does not account for when the frame was captured.  If capturing an
+     * old frame (e.g., bIsNewFrame is NVFBC_FALSE) the reported timestamp
+     * will reflect the time when the old frame was rendered by the display
+     * server.
+     */
+    uint64_t ulTimestampUs;
+    /*!
+     * [out] Number of frames generated since the last capture.
+     *
+     * This can help applications tell whether they missed frames or there
+     * were no frames generated by the server since the last capture.
+     */
+    uint32_t dwMissedFrames;
+    /*!
+     * [out] Whether the captured frame required post processing.
+     *
+     * See the 'Post Processing' section.
+     */
+    NVFBC_BOOL bRequiredPostProcessing;
+    /*!
+     * [out] Whether this frame was obtained via direct capture.
+     *
+     * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture.
+     */
+    NVFBC_BOOL bDirectCapture;
+} NVFBC_FRAME_GRAB_INFO;
+
+/*!
+ * Defines parameters for the CreateHandle() API call.
+ */
+typedef struct _NVFBC_CREATE_HANDLE_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_CREATE_HANDLE_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Application specific private information passed to the NvFBC
+     * session.
+     */
+    const void *privateData;
+    /*!
+     * [in] Size of the application specific private information passed to the
+     * NvFBC session.
+     */
+    uint32_t privateDataSize;
+    /*!
+     * [in] Whether NvFBC should not create and manage its own graphics context
+     *
+     * NvFBC internally uses OpenGL to perform graphics operations on the
+     * captured frames.  By default, NvFBC will create and manage (e.g., make
+     * current, detect new threads, etc.) its own OpenGL context.
+     *
+     * If set to NVFBC_TRUE, NvFBC will use the application's context.  It will
+     * be the application's responsibility to make sure that a context is
+     * current on the thread calling into the NvFBC API.
+     */
+    NVFBC_BOOL bExternallyManagedContext;
+    /*!
+     * [in] GLX context
+     *
+     * GLX context that NvFBC should use internally to create pixmaps and
+     * make them current when creating a new capture session.
+     *
+     * Note: NvFBC expects a context created against a GLX_RGBA_TYPE render
+     * type.
+     */
+    void *glxCtx;
+    /*!
+     * [in] GLX framebuffer configuration
+     *
+     * Framebuffer configuration that was used to create the GLX context, and
+     * that will be used to create pixmaps internally.
+     *
+     * Note: NvFBC expects a configuration having at least the following
+     * attributes:
+     *  GLX_DRAWABLE_TYPE, GLX_PIXMAP_BIT
+     *  GLX_BIND_TO_TEXTURE_RGBA_EXT, 1
+     *  GLX_BIND_TO_TEXTURE_TARGETS_EXT, GLX_TEXTURE_2D_BIT_EXT
+     */
+    void *glxFBConfig;
+} NVFBC_CREATE_HANDLE_PARAMS;
+
+/*!
+ * NVFBC_CREATE_HANDLE_PARAMS structure version.
+ */
+#define NVFBC_CREATE_HANDLE_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_CREATE_HANDLE_PARAMS, 2)
+
+/*!
+ * Defines parameters for the ::NvFBCDestroyHandle() API call.
+ */
+typedef struct _NVFBC_DESTROY_HANDLE_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_DESTROY_HANDLE_PARAMS_VER
+     */
+    uint32_t dwVersion;
+} NVFBC_DESTROY_HANDLE_PARAMS;
+
+/*!
+ * NVFBC_DESTROY_HANDLE_PARAMS structure version.
+ */
+#define NVFBC_DESTROY_HANDLE_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_DESTROY_HANDLE_PARAMS, 1)
+
+/*!
+ * Maximum number of connected RandR outputs to an X screen.
+ */
+#define NVFBC_OUTPUT_MAX 5
+
+/*!
+ * Maximum size in bytes of an RandR output name.
+ */
+#define NVFBC_OUTPUT_NAME_LEN 128
+
+/*!
+ * Describes an RandR output.
+ *
+ * Filling this structure relies on the XRandR extension.  This feature cannot
+ * be used if the extension is missing or its version is below the requirements.
+ *
+ * \see Requirements
+ */
+typedef struct _NVFBC_OUTPUT
+{
+    /*!
+     * Identifier of the RandR output.
+     */
+    uint32_t dwId;
+    /*!
+     * Name of the RandR output, as reported by tools such as xrandr(1).
+     *
+     * Example: "DVI-I-0"
+     */
+    char name[NVFBC_OUTPUT_NAME_LEN];
+    /*!
+     * Region of the X screen tracked by the RandR CRTC driving this RandR
+     * output.
+     */
+    NVFBC_BOX trackedBox;
+} NVFBC_RANDR_OUTPUT_INFO;
+
+/*!
+ * Defines parameters for the ::NvFBCGetStatus() API call.
+ */
+typedef struct _NVFBC_GET_STATUS_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_GET_STATUS_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [out] Whether or not framebuffer capture is supported by the graphics
+     * driver.
+     */
+    NVFBC_BOOL bIsCapturePossible;
+    /*!
+     * [out] Whether or not there is already a capture session on this system.
+     */
+    NVFBC_BOOL bCurrentlyCapturing;
+    /*!
+     * [out] Whether or not it is possible to create a capture session on this
+     * system.
+     */
+    NVFBC_BOOL bCanCreateNow;
+    /*!
+     * [out] Size of the X screen (framebuffer).
+     */
+    NVFBC_SIZE screenSize;
+    /*!
+     * [out] Whether the XRandR extension is available.
+     *
+     * If this extension is not available then it is not possible to have
+     * information about RandR outputs.
+     */
+    NVFBC_BOOL bXRandRAvailable;
+    /*!
+     * [out] Array of outputs connected to the X screen.
+     *
+     * An application can track a specific output by specifying its ID when
+     * creating a capture session.
+     *
+     * Only if XRandR is available.
+     */
+    NVFBC_RANDR_OUTPUT_INFO outputs[NVFBC_OUTPUT_MAX];
+    /*!
+     * [out] Number of outputs connected to the X screen.
+     *
+     * This must be used to parse the array of connected outputs.
+     *
+     * Only if XRandR is available.
+     */
+    uint32_t dwOutputNum;
+    /*!
+     * [out] Version of the NvFBC library running on this system.
+     */
+    uint32_t dwNvFBCVersion;
+    /*!
+     * [out] Whether the X server is currently in modeset.
+     *
+     * When the X server is in modeset, it must give up all its video
+     * memory allocations. It is not possible to create a capture
+     * session until the modeset is over.
+     *
+     * Note that VT-switches are considered modesets.
+     */
+    NVFBC_BOOL bInModeset;
+} NVFBC_GET_STATUS_PARAMS;
+
+/*!
+ * NVFBC_GET_STATUS_PARAMS structure version.
+ */
+#define NVFBC_GET_STATUS_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_GET_STATUS_PARAMS, 2)
+
+/*!
+ * Defines parameters for the ::NvFBCCreateCaptureSession() API call.
+ */
+typedef struct _NVFBC_CREATE_CAPTURE_SESSION_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Desired capture type.
+     *
+     * Note that when specifying ::NVFBC_CAPTURE_SHARED_CUDA NvFBC will try to
+     * dlopen() the corresponding libraries.  This means that NvFBC can run on
+     * a system without the CUDA library since it does not link against them.
+     */
+    NVFBC_CAPTURE_TYPE eCaptureType;
+    /*!
+     * [in] What region of the framebuffer should be tracked.
+     */
+    NVFBC_TRACKING_TYPE eTrackingType;
+    /*!
+     * [in] ID of the output to track if eTrackingType is set to
+     * ::NVFBC_TRACKING_OUTPUT.
+     */
+    uint32_t dwOutputId;
+    /*!
+     * [in] Crop the tracked region.
+     *
+     * The coordinates are relative to the tracked region.
+     *
+     * It can be set to 0 to capture the entire tracked region.
+     */
+    NVFBC_BOX captureBox;
+    /*!
+     * [in] Desired size of the captured frame.
+     *
+     * This parameter allows scaling the captured frame.
+     *
+     * It can be set to 0 to disable frame resizing.
+     */
+    NVFBC_SIZE frameSize;
+    /*!
+     * [in] Whether the mouse cursor should be composited to the frame.
+     *
+     * Disabling the cursor will not generate new frames when only the cursor
+     * is moved.
+     */
+    NVFBC_BOOL bWithCursor;
+    /*!
+     * [in] Whether NvFBC should not attempt to recover from modesets.
+     *
+     * NvFBC is able to detect when a modeset event occurred and can automatically
+     * re-create a capture session with the same settings as before, then resume
+     * its frame capture session transparently.
+     *
+     * This option allows disabling this behavior.  NVFBC_ERR_MUST_RECREATE
+     * will be returned in that case.
+     *
+     * It can be useful in the cases when an application needs to do some work
+     * between setting up a capture and grabbing the first frame.
+     *
+     * For example: an application using the ToGL interface needs to register
+     * resources with EncodeAPI prior to encoding frames.
+     *
+     * Note that during modeset recovery, NvFBC will try to re-create the
+     * capture session every second until it succeeds.
+     */
+    NVFBC_BOOL bDisableAutoModesetRecovery;
+    /*!
+     * [in] Whether NvFBC should round the requested frameSize.
+     *
+     * When disabled, NvFBC will not attempt to round the requested resolution.
+     *
+     * However, some pixel formats have resolution requirements.  E.g., YUV/NV
+     * formats must have a width being a multiple of 4, and a height being a
+     * multiple of 2.  RGB formats don't have such requirements.
+     *
+     * If the resolution doesn't meet the requirements of the format, then NvFBC
+     * will fail at setup time.
+     *
+     * When enabled, NvFBC will round the requested width to the next multiple
+     * of 4 and the requested height to the next multiple of 2.
+     *
+     * In this case, requesting any resolution will always work with every
+     * format.  However, an NvFBC client must be prepared to handle the case
+     * where the requested resolution is different than the captured resolution.
+     *
+     * NVFBC_FRAME_GRAB_INFO::dwWidth and NVFBC_FRAME_GRAB_INFO::dwHeight should
+     * always be used for getting information about captured frames.
+     */
+    NVFBC_BOOL bRoundFrameSize;
+    /*!
+     * [in] Rate in ms at which the display server generates new frames
+     *
+     * This controls the frequency at which the display server will generate
+     * new frames if new content is available.  This effectively controls the
+     * capture rate when using blocking calls.
+     *
+     * Note that lower values will increase the CPU and GPU loads.
+     *
+     * The default value is 16ms (~ 60 Hz).
+     */
+    uint32_t dwSamplingRateMs;
+    /*!
+     * [in] Enable push model for frame capture
+     *
+     * When set to NVFBC_TRUE, the display server will generate frames whenever
+     * it receives a damage event from applications.
+     *
+     * Setting this to NVFBC_TRUE will ignore ::dwSamplingRateMs.
+     *
+     * Using push model with the NVFBC_*_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY
+     * capture flag should guarantee the shortest amount of time between an
+     * application rendering a frame and an NvFBC client capturing it, provided
+     * that the NvFBC client is able to process the frames quickly enough.
+     *
+     * Note that applications running at high frame rates will increase CPU and
+     * GPU loads.
+     */
+    NVFBC_BOOL bPushModel;
+    /*!
+     * [in] Allow direct capture
+     *
+     * Direct capture allows NvFBC to attach itself to a fullscreen graphics
+     * application. Whenever that application presents a frame, it makes a copy
+     * of it directly into a buffer owned by NvFBC thus bypassing the X server.
+     *
+     * When direct capture is *not* enabled, the NVIDIA X driver generates a
+     * frame for NvFBC when it receives a damage event from an application if push
+     * model is enabled, or periodically checks if there are any pending damage
+     * events otherwise (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::dwSamplingRateMs).
+     *
+     * Direct capture is possible under the following conditions:
+     * - Direct capture is allowed
+     * - Push model is enabled (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bPushModel)
+     * - The mouse cursor is not composited (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bWithCursor)
+     * - No viewport transformation is required. This happens when the remote
+     *   desktop is e.g. rotated.
+     *
+     * When direct capture is possible, NvFBC will automatically attach itself
+     * to a fullscreen unoccluded application, if such exists.
+     *
+     * Notes:
+     * - This includes compositing desktops such as GNOME (e.g., gnome-shell
+     *   is the fullscreen unoccluded application).
+     * - There can be only one fullscreen unoccluded application at a time.
+     * - The NVIDIA X driver monitors which application qualifies or no
+     *   longer qualifies.
+     *
+     * For example, if a fullscreen application is launched in GNOME, NvFBC will
+     * detach from gnome-shell and attach to that application.
+     *
+     * Attaching and detaching happens automatically from the perspective of an
+     * NvFBC client. When detaching from an application, the X driver will
+     * transparently resume generating frames for NvFBC.
+     *
+     * An application can know whether a given frame was obtained through
+     * direct capture by checking NVFBC_FRAME_GRAB_INFO::bDirectCapture.
+     */
+    NVFBC_BOOL bAllowDirectCapture;
+} NVFBC_CREATE_CAPTURE_SESSION_PARAMS;
+
+/*!
+ * NVFBC_CREATE_CAPTURE_SESSION_PARAMS structure version.
+ */
+#define NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_CREATE_CAPTURE_SESSION_PARAMS, 6)
+
+/*!
+ * Defines parameters for the ::NvFBCDestroyCaptureSession() API call.
+ */
+typedef struct _NVFBC_DESTROY_CAPTURE_SESSION_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER
+     */
+    uint32_t dwVersion;
+} NVFBC_DESTROY_CAPTURE_SESSION_PARAMS;
+
+/*!
+ * NVFBC_DESTROY_CAPTURE_SESSION_PARAMS structure version.
+ */
+#define NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_DESTROY_CAPTURE_SESSION_PARAMS, 1)
+
+/*!
+ * Defines parameters for the ::NvFBCBindContext() API call.
+ */
+typedef struct _NVFBC_BIND_CONTEXT_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_BIND_CONTEXT_PARAMS_VER
+     */
+    uint32_t dwVersion;
+} NVFBC_BIND_CONTEXT_PARAMS;
+
+/*!
+ * NVFBC_BIND_CONTEXT_PARAMS structure version.
+ */
+#define NVFBC_BIND_CONTEXT_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_BIND_CONTEXT_PARAMS, 1)
+
+/*!
+ * Defines parameters for the ::NvFBCReleaseContext() API call.
+ */
+typedef struct _NVFBC_RELEASE_CONTEXT_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_RELEASE_CONTEXT_PARAMS_VER
+     */
+    uint32_t dwVersion;
+} NVFBC_RELEASE_CONTEXT_PARAMS;
+
+/*!
+ * NVFBC_RELEASE_CONTEXT_PARAMS structure version.
+ */
+#define NVFBC_RELEASE_CONTEXT_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_RELEASE_CONTEXT_PARAMS, 1)
+
+/*!
+ * Defines flags that can be used when capturing to system memory.
+ */
+typedef enum
+{
+    /*!
+     * Default, capturing waits for a new frame or mouse move.
+     *
+     * The default behavior of blocking grabs is to wait for a frame generated
+     * after the call was made.  But it's possible that there is a frame already
+     * ready that the client hasn't seen.
+     * \see NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY
+     */
+    NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS       = 0,
+    /*!
+     * Capturing does not wait for a new frame nor a mouse move.
+     *
+     * It is therefore possible to capture the same frame multiple times.
+     * When this occurs, the dwCurrentFrame parameter of the
+     * NVFBC_FRAME_GRAB_INFO structure is not incremented.
+     */
+    NVFBC_TOSYS_GRAB_FLAGS_NOWAIT        = (1 << 0),
+    /*!
+     * Forces the destination buffer to be refreshed even if the frame has not
+     * changed since previous capture.
+     *
+     * By default, if the captured frame is identical to the previous one, NvFBC
+     * will omit one copy and not update the destination buffer.
+     *
+     * Setting that flag will prevent this behavior.  This can be useful e.g.,
+     * if the application has modified the buffer in the meantime.
+     */
+    NVFBC_TOSYS_GRAB_FLAGS_FORCE_REFRESH = (1 << 1),
+    /*!
+     * Similar to NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS, except that the capture will
+     * not wait if there is already a frame available that the client has
+     * never seen yet.
+     */
+    NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2),
+} NVFBC_TOSYS_GRAB_FLAGS;
+
+/*!
+ * Defines parameters for the ::NvFBCToSysSetUp() API call.
+ */
+typedef struct _NVFBC_TOSYS_SETUP_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOSYS_SETUP_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Desired buffer format.
+     */
+    NVFBC_BUFFER_FORMAT eBufferFormat;
+    /*!
+     * [out] Pointer to a pointer to a buffer in system memory.
+     *
+     * This buffer contains the pixel value of the requested format.  Refer to
+     * the description of the buffer formats to understand the memory layout.
+     *
+     * The application does not need to allocate memory for this buffer.  It
+     * should not free this buffer either.  This buffer is automatically
+     * re-allocated when needed (e.g., when the resolution changes).
+     *
+     * This buffer is allocated by the NvFBC library to the proper size.  This
+     * size is returned in the dwByteSize field of the
+     * ::NVFBC_FRAME_GRAB_INFO structure.
+     */
+    void **ppBuffer;
+    /*!
+     * [in] Whether differential maps should be generated.
+     */
+    NVFBC_BOOL bWithDiffMap;
+    /*!
+     * [out] Pointer to a pointer to a buffer in system memory.
+     *
+     * This buffer contains the differential map of two frames.  It must be read
+     * as an array of unsigned char.  Each unsigned char is either 0 or
+     * non-zero.  0 means that the pixel value at the given location has not
+     * changed since the previous captured frame.  Non-zero means that the pixel
+     * value has changed.
+     *
+     * The application does not need to allocate memory for this buffer.  It
+     * should not free this buffer either.  This buffer is automatically
+     * re-allocated when needed (e.g., when the resolution changes).
+     *
+     * This buffer is allocated by the NvFBC library to the proper size.  The
+     * size of the differential map is returned in ::diffMapSize.
+     *
+     * This option is not compatible with the ::NVFBC_BUFFER_FORMAT_YUV420P and
+     * ::NVFBC_BUFFER_FORMAT_YUV444P buffer formats.
+     */
+    void **ppDiffMap;
+    /*!
+     * [in] Scaling factor of the differential maps.
+     *
+     * For example, a scaling factor of 16 means that one pixel of the diffmap
+     * will represent 16x16 pixels of the original frames.
+     *
+     * If any of these 16x16 pixels is different between the current and the
+     * previous frame, then the corresponding pixel in the diffmap will be set
+     * to non-zero.
+     *
+     * The default scaling factor is 1.  A dwDiffMapScalingFactor of 0 will be
+     * set to 1.
+     */
+    uint32_t dwDiffMapScalingFactor;
+    /*!
+     * [out] Size of the differential map.
+     *
+     * Only set if bWithDiffMap is set to NVFBC_TRUE.
+     */
+    NVFBC_SIZE diffMapSize;
+} NVFBC_TOSYS_SETUP_PARAMS;
+
+/*!
+ * NVFBC_TOSYS_SETUP_PARAMS structure version.
+ */
+#define NVFBC_TOSYS_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOSYS_SETUP_PARAMS, 3)
+
+/*!
+ * Defines parameters for the ::NvFBCToSysGrabFrame() API call.
+ */
+typedef struct _NVFBC_TOSYS_GRAB_FRAME_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Flags defining this capture's behavior (::NVFBC_TOSYS_GRAB_FLAGS).
+     */
+    uint32_t dwFlags;
+    /*!
+     * [out] Information about the captured frame.
+     *
+     * Can be NULL.
+     */
+    NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo;
+    /*!
+     * [in] Wait timeout in milliseconds.
+     *
+     * When capturing frames with the NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS or
+     * NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags,
+     * NvFBC will wait for a new frame or mouse move until the below timer
+     * expires.
+     *
+     * When timing out, the last captured frame will be returned.  Note that as
+     * long as the NVFBC_TOSYS_GRAB_FLAGS_FORCE_REFRESH flag is not set,
+     * returning an old frame will incur no performance penalty.
+     *
+     * NvFBC clients can use the return value of the grab frame operation to
+     * find out whether a new frame was captured, or the timer expired.
+     *
+     * Note that the behavior of blocking calls is to wait for a new frame
+     * *after* the call has been made.  When using timeouts, it is possible
+     * that NvFBC will return a new frame (e.g., it has never been captured
+     * before) even though no new frame was generated after the grab call.
+     *
+     * For the precise definition of what constitutes a new frame, see
+     * ::bIsNewFrame.
+     *
+     * Set to 0 to disable timeouts.
+     */
+    uint32_t dwTimeoutMs;
+} NVFBC_TOSYS_GRAB_FRAME_PARAMS;
+
+/*!
+ * NVFBC_TOSYS_GRAB_FRAME_PARAMS structure version.
+ */
+#define NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOSYS_GRAB_FRAME_PARAMS, 2)
+
+/*!
+ * Defines flags that can be used when capturing to a CUDA buffer in video memory.
+ */
+typedef enum
+{
+    /*!
+     * Default, capturing waits for a new frame or mouse move.
+     *
+     * The default behavior of blocking grabs is to wait for a frame generated
+     * after the call was made.  But it's possible that there is a frame already
+     * ready that the client hasn't seen.
+     * \see NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY
+     */
+    NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS      = 0,
+    /*!
+     * Capturing does not wait for a new frame nor a mouse move.
+     *
+     * It is therefore possible to capture the same frame multiple times.
+     * When this occurs, the dwCurrentFrame parameter of the
+     * NVFBC_FRAME_GRAB_INFO structure is not incremented.
+     */
+    NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT       = (1 << 0),
+    /*!
+     * Forces the destination buffer to be refreshed even if the frame has not
+     * changed since previous capture.
+     *
+     * By default, if the captured frame is identical to the previous one, NvFBC
+     * will omit one copy and not update the destination buffer.
+     *
+     * Setting that flag will prevent this behavior.  This can be useful e.g.,
+     * if the application has modified the buffer in the meantime.
+     */
+    NVFBC_TOCUDA_GRAB_FLAGS_FORCE_REFRESH = (1 << 1),
+    /*!
+     * Similar to NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS, except that the capture will
+     * not wait if there is already a frame available that the client has
+     * never seen yet.
+     */
+    NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2),
+} NVFBC_TOCUDA_FLAGS;
+
+/*!
+ * Defines parameters for the ::NvFBCToCudaSetUp() API call.
+ */
+typedef struct _NVFBC_TOCUDA_SETUP_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOCUDA_SETUP_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Desired buffer format.
+     */
+    NVFBC_BUFFER_FORMAT eBufferFormat;
+} NVFBC_TOCUDA_SETUP_PARAMS;
+
+/*!
+ * NVFBC_TOCUDA_SETUP_PARAMS structure version.
+ */
+#define NVFBC_TOCUDA_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOCUDA_SETUP_PARAMS, 1)
+
+/*!
+ * Defines parameters for the ::NvFBCToCudaGrabFrame() API call.
+ */
+typedef struct _NVFBC_TOCUDA_GRAB_FRAME_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER.
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Flags defining this capture's behavior (::NVFBC_TOCUDA_FLAGS).
+     */
+    uint32_t dwFlags;
+    /*!
+     * [out] Pointer to a ::CUdeviceptr
+     *
+     * The application does not need to allocate memory for this CUDA device.
+     *
+     * The application does need to create its own CUDA context to use this
+     * CUDA device.
+     *
+     * This ::CUdeviceptr will be mapped to a segment in video memory containing
+     * the frame.  It is not possible to process a CUDA device while capturing
+     * a new frame.  If the application wants to do so, it must copy the CUDA
+     * device using ::cuMemcpyDtoD or ::cuMemcpyDtoH beforehand.
+     */
+    void *pCUDADeviceBuffer;
+    /*!
+     * [out] Information about the captured frame.
+     *
+     * Can be NULL.
+     */
+    NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo;
+    /*!
+     * [in] Wait timeout in milliseconds.
+     *
+     * When capturing frames with the NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS or
+     * NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags,
+     * NvFBC will wait for a new frame or mouse move until the below timer
+     * expires.
+     *
+     * When timing out, the last captured frame will be returned.  Note that as
+     * long as the NVFBC_TOCUDA_GRAB_FLAGS_FORCE_REFRESH flag is not set,
+     * returning an old frame will incur no performance penalty.
+     *
+     * NvFBC clients can use the return value of the grab frame operation to
+     * find out whether a new frame was captured, or the timer expired.
+     *
+     * Note that the behavior of blocking calls is to wait for a new frame
+     * *after* the call has been made.  When using timeouts, it is possible
+     * that NvFBC will return a new frame (e.g., it has never been captured
+     * before) even though no new frame was generated after the grab call.
+     *
+     * For the precise definition of what constitutes a new frame, see
+     * ::bIsNewFrame.
+     *
+     * Set to 0 to disable timeouts.
+     */
+    uint32_t dwTimeoutMs;
+} NVFBC_TOCUDA_GRAB_FRAME_PARAMS;
+
+/*!
+ * NVFBC_TOCUDA_GRAB_FRAME_PARAMS structure version.
+ */
+#define NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOCUDA_GRAB_FRAME_PARAMS, 2)
+
+/*!
+ * Defines flags that can be used when capturing to an OpenGL buffer in video memory.
+ */
+typedef enum
+{
+    /*!
+     * Default, capturing waits for a new frame or mouse move.
+     *
+     * The default behavior of blocking grabs is to wait for a frame generated
+     * after the call was made.  But it's possible that there is a frame already
+     * ready that the client hasn't seen.
+     * \see NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY
+     */
+    NVFBC_TOGL_GRAB_FLAGS_NOFLAGS      = 0,
+    /*!
+     * Capturing does not wait for a new frame nor a mouse move.
+     *
+     * It is therefore possible to capture the same frame multiple times.
+     * When this occurs, the dwCurrentFrame parameter of the
+     * NVFBC_FRAME_GRAB_INFO structure is not incremented.
+     */
+    NVFBC_TOGL_GRAB_FLAGS_NOWAIT       = (1 << 0),
+    /*!
+     * Forces the destination buffer to be refreshed even if the frame has not
+     * changed since previous capture.
+     *
+     * By default, if the captured frame is identical to the previous one, NvFBC
+     * will omit one copy and not update the destination buffer.
+     *
+     * Setting that flag will prevent this behavior.  This can be useful e.g.,
+     * if the application has modified the buffer in the meantime.
+     */
+    NVFBC_TOGL_GRAB_FLAGS_FORCE_REFRESH = (1 << 1),
+    /*!
+     * Similar to NVFBC_TOGL_GRAB_FLAGS_NOFLAGS, except that the capture will
+     * not wait if there is already a frame available that the client has
+     * never seen yet.
+     */
+    NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2),
+} NVFBC_TOGL_FLAGS;
+
+/*!
+ * Maximum number of GL textures that can be used to store frames.
+ */
+#define NVFBC_TOGL_TEXTURES_MAX 2
+
+/*!
+ * Defines parameters for the ::NvFBCToGLSetUp() API call.
+ */
+typedef struct _NVFBC_TOGL_SETUP_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOGL_SETUP_PARAMS_VER
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Desired buffer format.
+     */
+    NVFBC_BUFFER_FORMAT eBufferFormat;
+    /*!
+     * [in] Whether differential maps should be generated.
+     */
+    NVFBC_BOOL bWithDiffMap;
+    /*!
+     * [out] Pointer to a pointer to a buffer in system memory.
+     *
+     * \see NVFBC_TOSYS_SETUP_PARAMS::ppDiffMap
+     */
+    void **ppDiffMap;
+    /*!
+     * [in] Scaling factor of the differential maps.
+     *
+     * \see NVFBC_TOSYS_SETUP_PARAMS::dwDiffMapScalingFactor
+     */
+    uint32_t dwDiffMapScalingFactor;
+    /*!
+     * [out] List of GL textures that will store the captured frames.
+     *
+     * This array is 0 terminated.  The number of textures varies depending on
+     * the capture settings (such as whether diffmaps are enabled).
+     *
+     * An application wishing to interop with, for example, EncodeAPI will need
+     * to register these textures prior to start encoding frames.
+     *
+     * After each frame capture, the index of the texture holding the current
+     * frame is returned in NVFBC_TOGL_GRAB_FRAME_PARAMS::dwTextureIndex.
+     */
+    uint32_t dwTextures[NVFBC_TOGL_TEXTURES_MAX];
+    /*!
+     * [out] GL target to which the texture should be bound.
+     */
+    uint32_t dwTexTarget;
+    /*!
+     * [out] GL format of the textures.
+     */
+    uint32_t dwTexFormat;
+    /*!
+     * [out] GL type of the textures.
+     */
+    uint32_t dwTexType;
+    /*!
+     * [out] Size of the differential map.
+     *
+     * Only set if bWithDiffMap is set to NVFBC_TRUE.
+     */
+    NVFBC_SIZE diffMapSize;
+} NVFBC_TOGL_SETUP_PARAMS;
+
+/*!
+ * NVFBC_TOGL_SETUP_PARAMS structure version.
+ */
+#define NVFBC_TOGL_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOGL_SETUP_PARAMS, 2)
+
+/*!
+ * Defines parameters for the ::NvFBCToGLGrabFrame() API call.
+ */
+typedef struct _NVFBC_TOGL_GRAB_FRAME_PARAMS
+{
+    /*!
+     * [in] Must be set to NVFBC_TOGL_GRAB_FRAME_PARAMS_VER.
+     */
+    uint32_t dwVersion;
+    /*!
+     * [in] Flags defining this capture's behavior (::NVFBC_TOGL_FLAGS).
+     */
+    uint32_t dwFlags;
+    /*!
+     * [out] Index of the texture storing the current frame.
+     *
+     * This is an index in the NVFBC_TOGL_SETUP_PARAMS::dwTextures array.
+     */
+    uint32_t dwTextureIndex;
+    /*!
+     * [out] Information about the captured frame.
+     *
+     * Can be NULL.
+     */
+    NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo;
+    /*!
+     * [in] Wait timeout in milliseconds.
+     *
+     * When capturing frames with the NVFBC_TOGL_GRAB_FLAGS_NOFLAGS or
+     * NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags,
+     * NvFBC will wait for a new frame or mouse move until the below timer
+     * expires.
+     *
+     * When timing out, the last captured frame will be returned.  Note that as
+     * long as the NVFBC_TOGL_GRAB_FLAGS_FORCE_REFRESH flag is not set,
+     * returning an old frame will incur no performance penalty.
+     *
+     * NvFBC clients can use the return value of the grab frame operation to
+     * find out whether a new frame was captured, or the timer expired.
+     *
+     * Note that the behavior of blocking calls is to wait for a new frame
+     * *after* the call has been made.  When using timeouts, it is possible
+     * that NvFBC will return a new frame (e.g., it has never been captured
+     * before) even though no new frame was generated after the grab call.
+     *
+     * For the precise definition of what constitutes a new frame, see
+     * ::bIsNewFrame.
+     *
+     * Set to 0 to disable timeouts.
+     */
+    uint32_t dwTimeoutMs;
+} NVFBC_TOGL_GRAB_FRAME_PARAMS;
+
+/*!
+ * NVFBC_TOGL_GRAB_FRAME_PARAMS structure version.
+ */
+#define NVFBC_TOGL_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOGL_GRAB_FRAME_PARAMS, 2)
+
+/*! @} FBC_STRUCT */
+
+/*!
+ * \defgroup FBC_FUNC API Entry Points
+ *
+ * Entry points are thread-safe and can be called concurrently.
+ *
+ * The locking model includes a global lock that protects session handle
+ * management (\see NvFBCCreateHandle, \see NvFBCDestroyHandle).
+ *
+ * Each NvFBC session uses a local lock to protect other entry points.  Note
+ * that in certain cases, a thread can hold the local lock for an undefined
+ * amount of time, such as grabbing a frame using a blocking call.
+ *
+ * Note that a context is associated with each session.  NvFBC clients wishing
+ * to share a session between different threads are expected to release and
+ * bind the context appropriately (\see NvFBCBindContext,
+ * \see NvFBCReleaseContext).  This is not required when each thread uses its
+ * own NvFBC session.
+ *
+ * @{
+ */
+
+/*!
+ * Gets the last error message that got recorded for a client.
+ *
+ * When NvFBC returns an error, it will save an error message that can be
+ * queried through this API call.  Only the last message is saved.
+ * The message and the return code should give enough information about
+ * what went wrong.
+ *
+ * \param [in] sessionHandle
+ *   Handle to the NvFBC client.
+ * \return
+ *   A NULL terminated error message, or an empty string.  Its maximum length
+ *   is NVFBC_ERROR_STR_LEN.
+ */
+const char* NVFBCAPI NvFBCGetLastErrorStr(const NVFBC_SESSION_HANDLE sessionHandle);
+
+/*!
+ * \brief Allocates a new handle for an NvFBC client.
+ *
+ * This function allocates a session handle used to identify an FBC client.
+ *
+ * This function implicitly calls NvFBCBindContext().
+ *
+ * \param [out] pSessionHandle
+ *   Pointer that will hold the allocated session handle.
+ * \param [in] pParams
+ *   ::NVFBC_CREATE_HANDLE_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_OUT_OF_MEMORY \n
+ *   ::NVFBC_ERR_MAX_CLIENTS \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_GLX \n
+ *   ::NVFBC_ERR_GL
+ *
+ */
+NVFBCSTATUS NVFBCAPI NvFBCCreateHandle(NVFBC_SESSION_HANDLE *pSessionHandle, NVFBC_CREATE_HANDLE_PARAMS *pParams);
+
+/*!
+ * \brief Destroys the handle of an NvFBC client.
+ *
+ * This function uninitializes an FBC client.
+ *
+ * This function implicitly calls NvFBCReleaseContext().
+ *
+ * After this function returns, it is not possible to use this session handle
+ * for any further API call.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_DESTROY_HANDLE_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCDestroyHandle(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_HANDLE_PARAMS *pParams);
+
+/*!
+ * \brief Gets the current status of the display driver.
+ *
+ * This function queries the display driver for various information.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_GET_STATUS_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCGetStatus(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_GET_STATUS_PARAMS *pParams);
+
+/*!
+ * \brief Binds the FBC context to the calling thread.
+ *
+ * The NvFBC library internally relies on objects that must be bound to a
+ * thread.  Such objects are OpenGL contexts and CUDA contexts.
+ *
+ * This function binds these objects to the calling thread.
+ *
+ * The FBC context must be bound to the calling thread for most NvFBC entry
+ * points, otherwise ::NVFBC_ERR_CONTEXT is returned.
+ *
+ * If the FBC context is already bound to a different thread,
+ * ::NVFBC_ERR_CONTEXT is returned.  The other thread must release the context
+ * first by calling the NvFBCReleaseContext() entry point.
+ *
+ * If the FBC context is already bound to the current thread, this function has
+ * no effects.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_BIND_CONTEXT_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCBindContext(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_BIND_CONTEXT_PARAMS *pParams);
+
+/*!
+ * \brief Releases the FBC context from the calling thread.
+ *
+ * If the FBC context is bound to a different thread, ::NVFBC_ERR_CONTEXT is
+ * returned.
+ *
+ * If the FBC context is already released, this function has no effects.
+ *
+ * \param [in] sessionHandle FBC session handle.
+ * \param [in] pParams ::NVFBC_RELEASE_CONTEXT_PARAMS
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCReleaseContext(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_RELEASE_CONTEXT_PARAMS *pParams);
+
+/*!
+ * \brief Creates a capture session for an FBC client.
+ *
+ * This function starts a capture session of the desired type (system memory,
+ * video memory with CUDA interop, or H.264 compressed frames in system memory).
+ *
+ * Not all types are supported on all systems.  Also, it is possible to use
+ * NvFBC without having the CUDA library.  In this case, requesting a capture
+ * session of the concerned type will return an error.
+ *
+ * After this function returns, the display driver will start generating frames
+ * that can be captured using the corresponding API call.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_CREATE_CAPTURE_SESSION_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INVALID_PARAM \n
+ *   ::NVFBC_ERR_OUT_OF_MEMORY \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_GLX \n
+ *   ::NVFBC_ERR_GL \n
+ *   ::NVFBC_ERR_CUDA \n
+ *   ::NVFBC_ERR_MUST_RECREATE \n
+ *   ::NVFBC_ERR_INTERNAL
+ */
+NVFBCSTATUS NVFBCAPI NvFBCCreateCaptureSession(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_CREATE_CAPTURE_SESSION_PARAMS *pParams);
+
+/*!
+ * \brief Destroys a capture session for an FBC client.
+ *
+ * This function stops a capture session and frees allocated objects.
+ *
+ * After this function returns, it is possible to create another capture
+ * session using the corresponding API call.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_DESTROY_CAPTURE_SESSION_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCDestroyCaptureSession(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_CAPTURE_SESSION_PARAMS *pParams);
+
+/*!
+ * \brief Sets up a capture to system memory session.
+ *
+ * This function configures how the capture to system memory should behave. It
+ * can be called anytime and several times after the capture session has been
+ * created.  However, it must be called at least once prior to start capturing
+ * frames.
+ *
+ * This function allocates the buffer that will contain the captured frame.
+ * The application does not need to free this buffer.  The size of this buffer
+ * is returned in the ::NVFBC_FRAME_GRAB_INFO structure.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_TOSYS_SETUP_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_UNSUPPORTED \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_INVALID_PARAM \n
+ *   ::NVFBC_ERR_OUT_OF_MEMORY \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToSysSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_SETUP_PARAMS *pParams);
+
+/*!
+ * \brief Captures a frame to a buffer in system memory.
+ *
+ * This function triggers a frame capture to a buffer in system memory that was
+ * registered with the ToSysSetUp() API call.
+ *
+ * Note that it is possible that the resolution of the desktop changes while
+ * capturing frames. This should be transparent for the application.
+ *
+ * When the resolution changes, the capture session is recreated using the same
+ * parameters, and necessary buffers are re-allocated. The frame counter is not
+ * reset.
+ *
+ * An application can detect that the resolution changed by comparing the
+ * dwByteSize member of the ::NVFBC_FRAME_GRAB_INFO against a previous
+ * frame and/or dwWidth and dwHeight.
+ *
+ * During a change of resolution the capture is paused even in asynchronous
+ * mode.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ * \param [in] pParams
+ *   ::NVFBC_TOSYS_GRAB_FRAME_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_MUST_RECREATE \n
+ *   \see NvFBCCreateCaptureSession \n
+ *   \see NvFBCToSysSetUp
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToSysGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_GRAB_FRAME_PARAMS *pParams);
+
+/*!
+ * \brief Sets up a capture to video memory session.
+ *
+ * This function configures how the capture to video memory with CUDA interop
+ * should behave.  It can be called anytime and several times after the capture
+ * session has been created.  However, it must be called at least once prior
+ * to start capturing frames.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ *
+ * \param [in] pParams
+ *   ::NVFBC_TOCUDA_SETUP_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_UNSUPPORTED \n
+ *   ::NVFBC_ERR_GL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToCudaSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_SETUP_PARAMS *pParams);
+
+/*!
+ * \brief Captures a frame to a CUDA device in video memory.
+ *
+ * This function triggers a frame capture to a CUDA device in video memory.
+ *
+ * Note about changes of resolution: \see NvFBCToSysGrabFrame
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ *
+ * \param [in] pParams
+ *   ::NVFBC_TOCUDA_GRAB_FRAME_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_CUDA \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_MUST_RECREATE \n
+ *   \see NvFBCCreateCaptureSession \n
+ *   \see NvFBCToCudaSetUp
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToCudaGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_GRAB_FRAME_PARAMS *pParams);
+
+/*!
+ * \brief Sets up a capture to OpenGL buffer in video memory session.
+ *
+ * This function configures how the capture to video memory should behave.
+ * It can be called anytime and several times after the capture session has been
+ * created.  However, it must be called at least once before frame capture
+ * starts.
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ *
+ * \param [in] pParams
+ *   ::NVFBC_TOGL_SETUP_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_UNSUPPORTED \n
+ *   ::NVFBC_ERR_GL \n
+ *   ::NVFBC_ERR_X
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToGLSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_SETUP_PARAMS *pParams);
+
+/*!
+ * \brief Captures a frame to an OpenGL buffer in video memory.
+ *
+ * This function triggers a frame capture to a selected resource in video memory.
+ *
+ * Note about changes of resolution: \see NvFBCToSysGrabFrame
+ *
+ * \param [in] sessionHandle
+ *   FBC session handle.
+ *
+ * \param [in] pParams
+ *   ::NVFBC_TOGL_GRAB_FRAME_PARAMS
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_HANDLE \n
+ *   ::NVFBC_ERR_API_VERSION \n
+ *   ::NVFBC_ERR_BAD_REQUEST \n
+ *   ::NVFBC_ERR_CONTEXT \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_INTERNAL \n
+ *   ::NVFBC_ERR_X \n
+ *   ::NVFBC_ERR_MUST_RECREATE \n
+ *   \see NvFBCCreateCaptureSession \n
+ *   \see NvFBCToGLSetUp
+ */
+NVFBCSTATUS NVFBCAPI NvFBCToGLGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_GRAB_FRAME_PARAMS *pParams);
+
+/*!
+ * \cond FBC_PFN
+ *
+ * Defines API function pointers
+ */
+typedef const char* (NVFBCAPI* PNVFBCGETLASTERRORSTR)(const NVFBC_SESSION_HANDLE sessionHandle);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATEHANDLE)(NVFBC_SESSION_HANDLE *pSessionHandle, NVFBC_CREATE_HANDLE_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCDESTROYHANDLE)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_HANDLE_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCBINDCONTEXT)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_BIND_CONTEXT_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCRELEASECONTEXT)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_RELEASE_CONTEXT_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCGETSTATUS)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_GET_STATUS_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATECAPTURESESSION)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_CREATE_CAPTURE_SESSION_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCDESTROYCAPTURESESSION)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_CAPTURE_SESSION_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOSYSSETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_SETUP_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOSYSGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_GRAB_FRAME_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOCUDASETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_SETUP_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOCUDAGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_GRAB_FRAME_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOGLSETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_SETUP_PARAMS *pParams);
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOGLGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_GRAB_FRAME_PARAMS *pParams);
+
+/// \endcond
+
+/*! @} FBC_FUNC */
+
+/*!
+ * \ingroup FBC_STRUCT
+ *
+ * Structure populated with API function pointers by ::NvFBCCreateInstance().
+ */
+typedef struct
+{
+    uint32_t                                  dwVersion;                  //!< [in] Must be set to NVFBC_VERSION.
+    PNVFBCGETLASTERRORSTR                     nvFBCGetLastErrorStr;       //!< [out] Pointer to ::NvFBCGetLastErrorStr().
+    PNVFBCCREATEHANDLE                        nvFBCCreateHandle;          //!< [out] Pointer to ::NvFBCCreateHandle().
+    PNVFBCDESTROYHANDLE                       nvFBCDestroyHandle;         //!< [out] Pointer to ::NvFBCDestroyHandle().
+    PNVFBCGETSTATUS                           nvFBCGetStatus;             //!< [out] Pointer to ::NvFBCGetStatus().
+    PNVFBCCREATECAPTURESESSION                nvFBCCreateCaptureSession;  //!< [out] Pointer to ::NvFBCCreateCaptureSession().
+    PNVFBCDESTROYCAPTURESESSION               nvFBCDestroyCaptureSession; //!< [out] Pointer to ::NvFBCDestroyCaptureSession().
+    PNVFBCTOSYSSETUP                          nvFBCToSysSetUp;            //!< [out] Pointer to ::NvFBCToSysSetUp().
+    PNVFBCTOSYSGRABFRAME                      nvFBCToSysGrabFrame;        //!< [out] Pointer to ::NvFBCToSysGrabFrame().
+    PNVFBCTOCUDASETUP                         nvFBCToCudaSetUp;           //!< [out] Pointer to ::NvFBCToCudaSetUp().
+    PNVFBCTOCUDAGRABFRAME                     nvFBCToCudaGrabFrame;       //!< [out] Pointer to ::NvFBCToCudaGrabFrame().
+    void*                                     pad1;                       //!< [out] Retired. Do not use.
+    void*                                     pad2;                       //!< [out] Retired. Do not use.
+    void*                                     pad3;                       //!< [out] Retired. Do not use.
+    PNVFBCBINDCONTEXT                         nvFBCBindContext;           //!< [out] Pointer to ::NvFBCBindContext().
+    PNVFBCRELEASECONTEXT                      nvFBCReleaseContext;        //!< [out] Pointer to ::NvFBCReleaseContext().
+    void*                                     pad4;                       //!< [out] Retired. Do not use.
+    void*                                     pad5;                       //!< [out] Retired. Do not use.
+    void*                                     pad6;                       //!< [out] Retired. Do not use.
+    void*                                     pad7;                       //!< [out] Retired. Do not use.
+    PNVFBCTOGLSETUP                           nvFBCToGLSetUp;             //!< [out] Pointer to ::NvFBCToGLSetUp().
+    PNVFBCTOGLGRABFRAME                       nvFBCToGLGrabFrame;         //!< [out] Pointer to ::NvFBCToGLGrabFrame().
+} NVFBC_API_FUNCTION_LIST;
+
+/*!
+ * \ingroup FBC_FUNC
+ *
+ * \brief Entry Points to the NvFBC interface.
+ *
+ * Creates an instance of the NvFBC interface, and populates the
+ * pFunctionList with function pointers to the API routines implemented by
+ * the NvFBC interface.
+ *
+ * \param [out] pFunctionList
+ *
+ * \return
+ *   ::NVFBC_SUCCESS \n
+ *   ::NVFBC_ERR_INVALID_PTR \n
+ *   ::NVFBC_ERR_API_VERSION
+ */
+NVFBCSTATUS NVFBCAPI NvFBCCreateInstance(NVFBC_API_FUNCTION_LIST *pFunctionList);
+/*!
+ * \ingroup FBC_FUNC
+ *
+ * Defines function pointer for the ::NvFBCCreateInstance() API call.
+ */
+typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATEINSTANCE)(NVFBC_API_FUNCTION_LIST *pFunctionList);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _NVFBC_H_
diff --git a/third-party/nvfbc/helper_math.h b/third-party/nvfbc/helper_math.h
new file mode 100644
index 00000000..d17b024e
--- /dev/null
+++ b/third-party/nvfbc/helper_math.h
@@ -0,0 +1,1469 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ *  This file implements common mathematical operations on vector types
+ *  (float3, float4 etc.) since these are not provided as standard by CUDA.
+ *
+ *  The syntax is modeled on the Cg standard library.
+ *
+ *  This is part of the Helper library includes
+ *
+ *    Thanks to Linh Hah for additions and fixes.
+ */
+
+#ifndef HELPER_MATH_H
+#define HELPER_MATH_H
+
+#include "cuda_runtime.h"
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#ifndef __CUDACC__
+#include <math.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// host implementations of CUDA functions
+////////////////////////////////////////////////////////////////////////////////
+
+inline float fminf(float a, float b)
+{
+    return a < b ? a : b;
+}
+
+inline float fmaxf(float a, float b)
+{
+    return a > b ? a : b;
+}
+
+inline int max(int a, int b)
+{
+    return a > b ? a : b;
+}
+
+inline int min(int a, int b)
+{
+    return a < b ? a : b;
+}
+
+inline float rsqrtf(float x)
+{
+    return 1.0f / sqrtf(x);
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// constructors
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 make_float2(float s)
+{
+    return make_float2(s, s);
+}
+inline __host__ __device__ float2 make_float2(float3 a)
+{
+    return make_float2(a.x, a.y);
+}
+inline __host__ __device__ float2 make_float2(int2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+inline __host__ __device__ float2 make_float2(uint2 a)
+{
+    return make_float2(float(a.x), float(a.y));
+}
+
+inline __host__ __device__ int2 make_int2(int s)
+{
+    return make_int2(s, s);
+}
+inline __host__ __device__ int2 make_int2(int3 a)
+{
+    return make_int2(a.x, a.y);
+}
+inline __host__ __device__ int2 make_int2(uint2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+inline __host__ __device__ int2 make_int2(float2 a)
+{
+    return make_int2(int(a.x), int(a.y));
+}
+
+inline __host__ __device__ uint2 make_uint2(uint s)
+{
+    return make_uint2(s, s);
+}
+inline __host__ __device__ uint2 make_uint2(uint3 a)
+{
+    return make_uint2(a.x, a.y);
+}
+inline __host__ __device__ uint2 make_uint2(int2 a)
+{
+    return make_uint2(uint(a.x), uint(a.y));
+}
+
+inline __host__ __device__ float3 make_float3(float s)
+{
+    return make_float3(s, s, s);
+}
+inline __host__ __device__ float3 make_float3(float2 a)
+{
+    return make_float3(a.x, a.y, 0.0f);
+}
+inline __host__ __device__ float3 make_float3(float2 a, float s)
+{
+    return make_float3(a.x, a.y, s);
+}
+inline __host__ __device__ float3 make_float3(float4 a)
+{
+    return make_float3(a.x, a.y, a.z);
+}
+inline __host__ __device__ float3 make_float3(int3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+inline __host__ __device__ float3 make_float3(uint3 a)
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+
+inline __host__ __device__ int3 make_int3(int s)
+{
+    return make_int3(s, s, s);
+}
+inline __host__ __device__ int3 make_int3(int2 a)
+{
+    return make_int3(a.x, a.y, 0);
+}
+inline __host__ __device__ int3 make_int3(int2 a, int s)
+{
+    return make_int3(a.x, a.y, s);
+}
+inline __host__ __device__ int3 make_int3(uint3 a)
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+inline __host__ __device__ int3 make_int3(float3 a)
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+
+inline __host__ __device__ uint3 make_uint3(uint s)
+{
+    return make_uint3(s, s, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a)
+{
+    return make_uint3(a.x, a.y, 0);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)
+{
+    return make_uint3(a.x, a.y, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint4 a)
+{
+    return make_uint3(a.x, a.y, a.z);
+}
+inline __host__ __device__ uint3 make_uint3(int3 a)
+{
+    return make_uint3(uint(a.x), uint(a.y), uint(a.z));
+}
+
+inline __host__ __device__ float4 make_float4(float s)
+{
+    return make_float4(s, s, s, s);
+}
+inline __host__ __device__ float4 make_float4(float3 a)
+{
+    return make_float4(a.x, a.y, a.z, 0.0f);
+}
+inline __host__ __device__ float4 make_float4(float3 a, float w)
+{
+    return make_float4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ float4 make_float4(int4 a)
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+inline __host__ __device__ float4 make_float4(uint4 a)
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+
+inline __host__ __device__ int4 make_int4(int s)
+{
+    return make_int4(s, s, s, s);
+}
+inline __host__ __device__ int4 make_int4(int3 a)
+{
+    return make_int4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ int4 make_int4(int3 a, int w)
+{
+    return make_int4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ int4 make_int4(uint4 a)
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+inline __host__ __device__ int4 make_int4(float4 a)
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+
+
+inline __host__ __device__ uint4 make_uint4(uint s)
+{
+    return make_uint4(s, s, s, s);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a)
+{
+    return make_uint4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)
+{
+    return make_uint4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ uint4 make_uint4(int4 a)
+{
+    return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// negate (unary minus; const-reference operand so temporaries like -(a + b) work)
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator-(const float2 &a)
+{
+    return make_float2(-a.x, -a.y);
+}
+inline __host__ __device__ int2 operator-(const int2 &a)
+{
+    return make_int2(-a.x, -a.y);
+}
+inline __host__ __device__ float3 operator-(const float3 &a)
+{
+    return make_float3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ int3 operator-(const int3 &a)
+{
+    return make_int3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ float4 operator-(const float4 &a)
+{
+    return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __host__ __device__ int4 operator-(const int4 &a)
+{
+    return make_int4(-a.x, -a.y, -a.z, -a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// addition
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator+(float2 a, float2 b)
+{
+    return make_float2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(float2 &a, float2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ float2 operator+(float2 a, float b)
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ float2 operator+(float b, float2 a)
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(float2 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+}
+
+inline __host__ __device__ int2 operator+(int2 a, int2 b)
+{
+    return make_int2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(int2 &a, int2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ int2 operator+(int2 a, int b)
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ int2 operator+(int b, int2 a)
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(int2 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+}
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{
+    return make_uint2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint2 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ uint2 operator+(uint2 a, uint b)
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ uint2 operator+(uint b, uint2 a)
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+}
+
+
+inline __host__ __device__ float3 operator+(float3 a, float3 b)
+{
+    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(float3 &a, float3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ float3 operator+(float3 a, float b)
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(float3 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int3 a, int3 b)
+{
+    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(int3 &a, int3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ int3 operator+(int3 a, int b)
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(int3 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)
+{
+    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint3 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ uint3 operator+(uint3 a, uint b)
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int b, int3 a)
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ uint3 operator+(uint b, uint3 a)
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ float3 operator+(float b, float3 a)
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+
+inline __host__ __device__ float4 operator+(float4 a, float4 b)
+{
+    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(float4 &a, float4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ float4 operator+(float4 a, float b)
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ float4 operator+(float b, float4 a)
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(float4 &a, float b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+inline __host__ __device__ int4 operator+(int4 a, int4 b)
+{
+    return make_int4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(int4 &a, int4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ int4 operator+(int4 a, int b)
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ int4 operator+(int b, int4 a)
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(int4 &a, int b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)
+{
+    return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint4 b)
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ uint4 operator+(uint4 a, uint b)
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ uint4 operator+(uint b, uint4 a)
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint b)
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// subtract
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator-(float2 a, float2 b)
+{
+    return make_float2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ float2 operator-(float2 a, float b)
+{
+    return make_float2(a.x - b, a.y - b);
+}
+inline __host__ __device__ float2 operator-(float b, float2 a)
+{
+    return make_float2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ int2 operator-(int2 a, int2 b)
+{
+    return make_int2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ int2 operator-(int2 a, int b)
+{
+    return make_int2(a.x - b, a.y - b);
+}
+inline __host__ __device__ int2 operator-(int b, int2 a)
+{
+    return make_int2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ uint2 operator-(uint2 a, uint2 b)
+{
+    return make_uint2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint2 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ uint2 operator-(uint2 a, uint b)
+{
+    return make_uint2(a.x - b, a.y - b);
+}
+inline __host__ __device__ uint2 operator-(uint b, uint2 a)
+{
+    return make_uint2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(uint2 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ float3 operator-(float3 a, float3 b)
+{
+    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ float3 operator-(float3 a, float b)
+{
+    return make_float3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ float3 operator-(float b, float3 a)
+{
+    return make_float3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(float3 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+
+inline __host__ __device__ int3 operator-(int3 a, int3 b)
+{
+    return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ int3 operator-(int3 a, int b)
+{
+    return make_int3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ int3 operator-(int b, int3 a)
+{
+    return make_int3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(int3 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+
+inline __host__ __device__ uint3 operator-(uint3 a, uint3 b)
+{
+    return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint3 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+}
+inline __host__ __device__ uint3 operator-(uint3 a, uint b)
+{
+    return make_uint3(a.x - b, a.y - b, a.z - b);
+}
+inline __host__ __device__ uint3 operator-(uint b, uint3 a)
+{
+    return make_uint3(b - a.x, b - a.y, b - a.z);
+}
+inline __host__ __device__ void operator-=(uint3 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+}
+
+inline __host__ __device__ float4 operator-(float4 a, float4 b)
+{
+    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(float4 &a, float4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ float4 operator-(float4 a, float b)
+{
+    return make_float4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+inline __host__ __device__ void operator-=(float4 &a, float b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+
+inline __host__ __device__ int4 operator-(int4 a, int4 b)
+{
+    return make_int4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ int4 operator-(int4 a, int b)
+{
+    return make_int4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+inline __host__ __device__ int4 operator-(int b, int4 a)
+{
+    return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(int4 &a, int b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+
+inline __host__ __device__ uint4 operator-(uint4 a, uint4 b)
+{
+    return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z,  a.w - b.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint4 b)
+{
+    a.x -= b.x;
+    a.y -= b.y;
+    a.z -= b.z;
+    a.w -= b.w;
+}
+inline __host__ __device__ uint4 operator-(uint4 a, uint b)
+{
+    return make_uint4(a.x - b, a.y - b, a.z - b,  a.w - b);
+}
+inline __host__ __device__ uint4 operator-(uint b, uint4 a)
+{
+    return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+inline __host__ __device__ void operator-=(uint4 &a, uint b)
+{
+    a.x -= b;
+    a.y -= b;
+    a.z -= b;
+    a.w -= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// multiply
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator*(float2 a, float2 b)
+{
+    return make_float2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(float2 &a, float2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+}
+inline __host__ __device__ float2 operator*(float2 a, float b)
+{
+    return make_float2(a.x * b, a.y * b);
+}
+inline __host__ __device__ float2 operator*(float b, float2 a)
+{
+    return make_float2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(float2 &a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+}
+
+inline __host__ __device__ int2 operator*(int2 a, int2 b)
+{
+    return make_int2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+}
+inline __host__ __device__ int2 operator*(int2 a, int b)
+{
+    return make_int2(a.x * b, a.y * b);
+}
+inline __host__ __device__ int2 operator*(int b, int2 a)
+{
+    return make_int2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(int2 &a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+}
+
+inline __host__ __device__ uint2 operator*(uint2 a, uint2 b)
+{
+    return make_uint2(a.x * b.x, a.y * b.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint2 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+}
+inline __host__ __device__ uint2 operator*(uint2 a, uint b)
+{
+    return make_uint2(a.x * b, a.y * b);
+}
+inline __host__ __device__ uint2 operator*(uint b, uint2 a)
+{
+    return make_uint2(b * a.x, b * a.y);
+}
+inline __host__ __device__ void operator*=(uint2 &a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+}
+
+inline __host__ __device__ float3 operator*(float3 a, float3 b)
+{
+    return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(float3 &a, float3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+inline __host__ __device__ float3 operator*(float3 a, float b)
+{
+    return make_float3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ float3 operator*(float b, float3 a)
+{
+    return make_float3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(float3 &a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+}
+
+inline __host__ __device__ int3 operator*(int3 a, int3 b)
+{
+    return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+inline __host__ __device__ int3 operator*(int3 a, int b)
+{
+    return make_int3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ int3 operator*(int b, int3 a)
+{
+    return make_int3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(int3 &a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+}
+
+inline __host__ __device__ uint3 operator*(uint3 a, uint3 b)
+{
+    return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint3 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+}
+inline __host__ __device__ uint3 operator*(uint3 a, uint b)
+{
+    return make_uint3(a.x * b, a.y * b, a.z * b);
+}
+inline __host__ __device__ uint3 operator*(uint b, uint3 a)
+{
+    return make_uint3(b * a.x, b * a.y, b * a.z);
+}
+inline __host__ __device__ void operator*=(uint3 &a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+}
+
+inline __host__ __device__ float4 operator*(float4 a, float4 b)
+{
+    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
+}
+inline __host__ __device__ void operator*=(float4 &a, float4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+}
+inline __host__ __device__ float4 operator*(float4 a, float b)
+{
+    return make_float4(a.x * b, a.y * b, a.z * b,  a.w * b);
+}
+inline __host__ __device__ float4 operator*(float b, float4 a)
+{
+    return make_float4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(float4 &a, float b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+}
+
+inline __host__ __device__ int4 operator*(int4 a, int4 b)
+{
+    return make_int4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+}
+inline __host__ __device__ int4 operator*(int4 a, int b)
+{
+    return make_int4(a.x * b, a.y * b, a.z * b,  a.w * b);
+}
+inline __host__ __device__ int4 operator*(int b, int4 a)
+{
+    return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(int4 &a, int b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+}
+
+inline __host__ __device__ uint4 operator*(uint4 a, uint4 b)
+{
+    return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z,  a.w * b.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint4 b)
+{
+    a.x *= b.x;
+    a.y *= b.y;
+    a.z *= b.z;
+    a.w *= b.w;
+}
+inline __host__ __device__ uint4 operator*(uint4 a, uint b)
+{
+    return make_uint4(a.x * b, a.y * b, a.z * b,  a.w * b);
+}
+inline __host__ __device__ uint4 operator*(uint b, uint4 a)
+{
+    return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+inline __host__ __device__ void operator*=(uint4 &a, uint b)
+{
+    a.x *= b;
+    a.y *= b;
+    a.z *= b;
+    a.w *= b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// divide
+////////////////////////////////////////////////////////////////////////////////
+
+// Component-wise quotient of two float2 vectors.
+inline __host__ __device__ float2 operator/(float2 a, float2 b) {
+    return make_float2(a.x / b.x, a.y / b.y);
+}
+// In-place component-wise division of a by b.
+inline __host__ __device__ void operator/=(float2 &a, float2 b) {
+    a.x = a.x / b.x;
+    a.y = a.y / b.y;
+}
+// Divides each component of a by the scalar b.
+inline __host__ __device__ float2 operator/(float2 a, float b) {
+    return make_float2(a.x / b, a.y / b);
+}
+// In-place division of each component of a by the scalar b.
+inline __host__ __device__ void operator/=(float2 &a, float b) {
+    a.x = a.x / b;
+    a.y = a.y / b;
+}
+// Divides the scalar b by each component of a (b is the numerator).
+inline __host__ __device__ float2 operator/(float b, float2 a) {
+    return make_float2(b / a.x, b / a.y);
+}
+
+// Component-wise quotient of two float3 vectors.
+inline __host__ __device__ float3 operator/(float3 a, float3 b) {
+    return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+// In-place component-wise division of a by b.
+inline __host__ __device__ void operator/=(float3 &a, float3 b) {
+    a.x = a.x / b.x;
+    a.y = a.y / b.y;
+    a.z = a.z / b.z;
+}
+// Divides each component of a by the scalar b.
+inline __host__ __device__ float3 operator/(float3 a, float b) {
+    return make_float3(a.x / b, a.y / b, a.z / b);
+}
+// In-place division of each component of a by the scalar b.
+inline __host__ __device__ void operator/=(float3 &a, float b) {
+    a.x = a.x / b;
+    a.y = a.y / b;
+    a.z = a.z / b;
+}
+// Divides the scalar b by each component of a (b is the numerator).
+inline __host__ __device__ float3 operator/(float b, float3 a) {
+    return make_float3(b / a.x, b / a.y, b / a.z);
+}
+
+// Component-wise quotient of two float4 vectors.
+inline __host__ __device__ float4 operator/(float4 a, float4 b) {
+    return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+// In-place component-wise division of a by b.
+inline __host__ __device__ void operator/=(float4 &a, float4 b) {
+    a.x = a.x / b.x;
+    a.y = a.y / b.y;
+    a.z = a.z / b.z;
+    a.w = a.w / b.w;
+}
+// Divides each component of a by the scalar b.
+inline __host__ __device__ float4 operator/(float4 a, float b) {
+    return make_float4(a.x / b, a.y / b, a.z / b, a.w / b);
+}
+// In-place division of each component of a by the scalar b.
+inline __host__ __device__ void operator/=(float4 &a, float b) {
+    a.x = a.x / b;
+    a.y = a.y / b;
+    a.z = a.z / b;
+    a.w = a.w / b;
+}
+// Divides the scalar b by each component of a (b is the numerator).
+inline __host__ __device__ float4 operator/(float b, float4 a) {
+    return make_float4(b / a.x, b / a.y, b / a.z, b / a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// min
+////////////////////////////////////////////////////////////////////////////////
+
+// Component-wise minimum of two float2 vectors, via scalar fminf.
+inline __host__ __device__ float2 fminf(float2 a, float2 b) {
+    return make_float2(fminf(a.x, b.x), fminf(a.y, b.y));
+}
+// Component-wise minimum of two float3 vectors, via scalar fminf.
+inline __host__ __device__ float3 fminf(float3 a, float3 b) {
+    return make_float3(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z));
+}
+// Component-wise minimum of two float4 vectors, via scalar fminf.
+inline __host__ __device__ float4 fminf(float4 a, float4 b) {
+    return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+
+// Component-wise minimum of two int2 vectors, via scalar min.
+inline __host__ __device__ int2 min(int2 a, int2 b) {
+    return make_int2(min(a.x, b.x), min(a.y, b.y));
+}
+// Component-wise minimum of two int3 vectors, via scalar min.
+inline __host__ __device__ int3 min(int3 a, int3 b) {
+    return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+}
+// Component-wise minimum of two int4 vectors, via scalar min.
+inline __host__ __device__ int4 min(int4 a, int4 b) {
+    return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+}
+
+// Component-wise minimum of two uint2 vectors, via scalar min.
+inline __host__ __device__ uint2 min(uint2 a, uint2 b) {
+    return make_uint2(min(a.x, b.x), min(a.y, b.y));
+}
+// Component-wise minimum of two uint3 vectors, via scalar min.
+inline __host__ __device__ uint3 min(uint3 a, uint3 b) {
+    return make_uint3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+}
+// Component-wise minimum of two uint4 vectors, via scalar min.
+inline __host__ __device__ uint4 min(uint4 a, uint4 b) {
+    return make_uint4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// max
+////////////////////////////////////////////////////////////////////////////////
+
+// Component-wise maximum of two float2 vectors, via scalar fmaxf.
+inline __host__ __device__ float2 fmaxf(float2 a, float2 b) {
+    return make_float2(fmaxf(a.x, b.x), fmaxf(a.y, b.y));
+}
+// Component-wise maximum of two float3 vectors, via scalar fmaxf.
+inline __host__ __device__ float3 fmaxf(float3 a, float3 b) {
+    return make_float3(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z));
+}
+// Component-wise maximum of two float4 vectors, via scalar fmaxf.
+inline __host__ __device__ float4 fmaxf(float4 a, float4 b) {
+    return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+
+// Component-wise maximum of two int2 vectors, via scalar max.
+inline __host__ __device__ int2 max(int2 a, int2 b) {
+    return make_int2(max(a.x, b.x), max(a.y, b.y));
+}
+// Component-wise maximum of two int3 vectors, via scalar max.
+inline __host__ __device__ int3 max(int3 a, int3 b) {
+    return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+}
+// Component-wise maximum of two int4 vectors, via scalar max.
+inline __host__ __device__ int4 max(int4 a, int4 b) {
+    return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+}
+
+// Component-wise maximum of two uint2 vectors, via scalar max.
+inline __host__ __device__ uint2 max(uint2 a, uint2 b) {
+    return make_uint2(max(a.x, b.x), max(a.y, b.y));
+}
+// Component-wise maximum of two uint3 vectors, via scalar max.
+inline __host__ __device__ uint3 max(uint3 a, uint3 b) {
+    return make_uint3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+}
+// Component-wise maximum of two uint4 vectors, via scalar max.
+inline __host__ __device__ uint4 max(uint4 a, uint4 b) {
+    return make_uint4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// lerp
+// - linear interpolation between a and b, based on value t in [0, 1] range
+////////////////////////////////////////////////////////////////////////////////
+
+// Scalar blend: yields a when t == 0 and moves toward b as t grows.
+inline __device__ __host__ float lerp(float a, float b, float t) {
+    return a + t * (b - a);
+}
+// float2 blend with one scalar weight t shared by both components.
+inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) {
+    return a + t * (b - a);
+}
+// float3 blend with one scalar weight t shared by all components.
+inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) {
+    return a + t * (b - a);
+}
+// float4 blend with one scalar weight t shared by all components.
+inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) {
+    return a + t * (b - a);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// clamp
+// - clamp the value v to be in the range [a, b]
+////////////////////////////////////////////////////////////////////////////////
+
+// Limits the float f to [a, b]: caps at b with fminf, then floors at a with fmaxf.
+inline __device__ __host__ float clamp(float f, float a, float b) {
+    return fmaxf(a, fminf(f, b));
+}
+// Limits the int f to [a, b]: caps at b with min, then floors at a with max.
+inline __device__ __host__ int clamp(int f, int a, int b) {
+    return max(a, min(f, b));
+}
+// Limits the uint f to [a, b]: caps at b with min, then floors at a with max.
+inline __device__ __host__ uint clamp(uint f, uint a, uint b) {
+    return max(a, min(f, b));
+}
+
+// Clamps both components of v to the same scalar range [a, b].
+inline __device__ __host__ float2 clamp(float2 v, float a, float b) {
+    return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+// Clamps each component of v to the range given by the matching components of a and b.
+inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) {
+    return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+// Clamps all three components of v to the same scalar range [a, b].
+inline __device__ __host__ float3 clamp(float3 v, float a, float b) {
+    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+// Clamps each component of v to the range given by the matching components of a and b.
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) {
+    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+// Clamps all four components of v to the same scalar range [a, b].
+inline __device__ __host__ float4 clamp(float4 v, float a, float b) {
+    return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+// Clamps each component of v to the range given by the matching components of a and b.
+inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) {
+    return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+// Clamps both components of v to the same scalar range [a, b].
+inline __device__ __host__ int2 clamp(int2 v, int a, int b) {
+    return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+// Clamps each component of v to the range given by the matching components of a and b.
+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) {
+    return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+// Clamps all three components of v to the same scalar range [a, b].
+inline __device__ __host__ int3 clamp(int3 v, int a, int b) {
+    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+// Clamps each component of v to the range given by the matching components of a and b.
+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) {
+    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+// Clamps all four components of v to the same scalar range [a, b].
+inline __device__ __host__ int4 clamp(int4 v, int a, int b) {
+    return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+// Clamps each component of v to the range given by the matching components of a and b.
+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) {
+    return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+// Clamps both components of v to the same scalar range [a, b].
+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) {
+    return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+// Clamps each component of v to the range given by the matching components of a and b.
+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) {
+    return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+// Clamps all three components of v to the same scalar range [a, b].
+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) {
+    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+// Clamps each component of v to the range given by the matching components of a and b.
+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) {
+    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+// Clamps all four components of v to the same scalar range [a, b].
+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) {
+    return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+// Clamps each component of v to the range given by the matching components of a and b.
+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) {
+    return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// dot product
+////////////////////////////////////////////////////////////////////////////////
+
+// Inner product of two float2 vectors: sum of the per-component products.
+inline __host__ __device__ float dot(float2 a, float2 b) {
+    return a.x * b.x + a.y * b.y;
+}
+// Inner product of two float3 vectors: sum of the per-component products.
+inline __host__ __device__ float dot(float3 a, float3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+// Inner product of two float4 vectors: sum of the per-component products.
+inline __host__ __device__ float dot(float4 a, float4 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+// Inner product of two int2 vectors: sum of the per-component products.
+inline __host__ __device__ int dot(int2 a, int2 b) {
+    return a.x * b.x + a.y * b.y;
+}
+// Inner product of two int3 vectors: sum of the per-component products.
+inline __host__ __device__ int dot(int3 a, int3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+// Inner product of two int4 vectors: sum of the per-component products.
+inline __host__ __device__ int dot(int4 a, int4 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+// Inner product of two uint2 vectors: sum of the per-component products.
+inline __host__ __device__ uint dot(uint2 a, uint2 b) {
+    return a.x * b.x + a.y * b.y;
+}
+// Inner product of two uint3 vectors: sum of the per-component products.
+inline __host__ __device__ uint dot(uint3 a, uint3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+// Inner product of two uint4 vectors: sum of the per-component products.
+inline __host__ __device__ uint dot(uint4 a, uint4 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// length
+////////////////////////////////////////////////////////////////////////////////
+
+// Euclidean norm of a float2: square root of the vector dotted with itself.
+inline __host__ __device__ float length(float2 v) {
+    return sqrtf(dot(v, v));
+}
+// Euclidean norm of a float3: square root of the vector dotted with itself.
+inline __host__ __device__ float length(float3 v) {
+    return sqrtf(dot(v, v));
+}
+// Euclidean norm of a float4: square root of the vector dotted with itself.
+inline __host__ __device__ float length(float4 v) {
+    return sqrtf(dot(v, v));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// normalize
+////////////////////////////////////////////////////////////////////////////////
+
+// Scales a float2 by rsqrtf of its squared length; zero-length input is not guarded.
+inline __host__ __device__ float2 normalize(float2 v) {
+    float reciprocalNorm = rsqrtf(dot(v, v));
+    return v * reciprocalNorm;
+}
+// Scales a float3 by rsqrtf of its squared length; zero-length input is not guarded.
+inline __host__ __device__ float3 normalize(float3 v) {
+    float reciprocalNorm = rsqrtf(dot(v, v));
+    return v * reciprocalNorm;
+}
+// Scales a float4 by rsqrtf of its squared length; zero-length input is not guarded.
+inline __host__ __device__ float4 normalize(float4 v) {
+    float reciprocalNorm = rsqrtf(dot(v, v));
+    return v * reciprocalNorm;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// floor
+////////////////////////////////////////////////////////////////////////////////
+
+// Applies scalar floorf to each component of a float2.
+inline __host__ __device__ float2 floorf(float2 v) {
+    return make_float2(floorf(v.x), floorf(v.y));
+}
+// Applies scalar floorf to each component of a float3.
+inline __host__ __device__ float3 floorf(float3 v) {
+    return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
+}
+// Applies scalar floorf to each component of a float4.
+inline __host__ __device__ float4 floorf(float4 v) {
+    return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// frac - returns the fractional portion of a scalar or each vector component
+////////////////////////////////////////////////////////////////////////////////
+
+// Fractional part of a scalar, computed as v minus floorf(v).
+inline __host__ __device__ float fracf(float v) {
+    return v - floorf(v);
+}
+// Applies scalar fracf to each component of a float2.
+inline __host__ __device__ float2 fracf(float2 v) {
+    return make_float2(fracf(v.x), fracf(v.y));
+}
+// Applies scalar fracf to each component of a float3.
+inline __host__ __device__ float3 fracf(float3 v) {
+    return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
+}
+// Applies scalar fracf to each component of a float4.
+inline __host__ __device__ float4 fracf(float4 v) {
+    return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// fmod
+////////////////////////////////////////////////////////////////////////////////
+
+// Applies scalar fmodf to each component pair of two float2 vectors.
+inline __host__ __device__ float2 fmodf(float2 a, float2 b) {
+    return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
+}
+// Applies scalar fmodf to each component pair of two float3 vectors.
+inline __host__ __device__ float3 fmodf(float3 a, float3 b) {
+    return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
+}
+// Applies scalar fmodf to each component pair of two float4 vectors.
+inline __host__ __device__ float4 fmodf(float4 a, float4 b) {
+    return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// absolute value
+////////////////////////////////////////////////////////////////////////////////
+
+// Component-wise absolute value of a float2; fabsf keeps the math in single precision.
+inline __host__ __device__ float2 fabs(float2 v) {
+    return make_float2(fabsf(v.x), fabsf(v.y));
+}
+// Component-wise absolute value of a float3; fabsf keeps the math in single precision.
+inline __host__ __device__ float3 fabs(float3 v) {
+    return make_float3(fabsf(v.x), fabsf(v.y), fabsf(v.z));
+}
+// Component-wise absolute value of a float4; fabsf keeps the math in single precision.
+inline __host__ __device__ float4 fabs(float4 v) {
+    return make_float4(fabsf(v.x), fabsf(v.y), fabsf(v.z), fabsf(v.w));
+}
+
+// Applies scalar abs to each component of an int2.
+inline __host__ __device__ int2 abs(int2 v) {
+    return make_int2(abs(v.x), abs(v.y));
+}
+// Applies scalar abs to each component of an int3.
+inline __host__ __device__ int3 abs(int3 v) {
+    return make_int3(abs(v.x), abs(v.y), abs(v.z));
+}
+// Applies scalar abs to each component of an int4.
+inline __host__ __device__ int4 abs(int4 v) {
+    return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// reflect
+// - returns reflection of incident ray I around surface normal N
+// - N should be normalized, reflected vector's length is equal to length of I
+////////////////////////////////////////////////////////////////////////////////
+
+// Mirrors the incident vector i about the normal n: i minus twice n scaled by dot(n, i).
+inline __host__ __device__ float3 reflect(float3 i, float3 n) {
+    return i - 2.0f * n * dot(n, i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// cross product
+////////////////////////////////////////////////////////////////////////////////
+
+// Vector (cross) product a x b of two float3 values, built component by component.
+inline __host__ __device__ float3 cross(float3 a, float3 b) {
+    return make_float3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// smoothstep
+// - returns 0 if x < a
+// - returns 1 if x > b
+// - otherwise returns smooth interpolation between 0 and 1 based on x
+////////////////////////////////////////////////////////////////////////////////
+
+// Scalar smoothstep: t = (x - a) / (b - a) clamped to [0, 1], result t*t*(3 - 2t).
+inline __device__ __host__ float smoothstep(float a, float b, float x) {
+    float t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (t * t * (3.0f - (2.0f * t)));
+}
+// float2 smoothstep: the same clamped cubic, evaluated independently per component.
+inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) {
+    float2 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (t * t * (make_float2(3.0f) - (make_float2(2.0f) * t)));
+}
+// float3 smoothstep: the same clamped cubic, evaluated independently per component.
+inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) {
+    float3 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (t * t * (make_float3(3.0f) - (make_float3(2.0f) * t)));
+}
+// float4 smoothstep: the same clamped cubic, evaluated independently per component.
+inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) {
+    float4 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (t * t * (make_float4(3.0f) - (make_float4(2.0f) * t)));
+}
+
+#endif