From fed329568ca44993d9cc0a8fe145e73a332fb111 Mon Sep 17 00:00:00 2001 From: loki-47-6F-64 Date: Sun, 19 Sep 2021 20:40:34 +0200 Subject: [PATCH] Use an actual cuda kernel to convert RGB to NV12 --- CMakeLists.txt | 39 +- sunshine/platform/linux/cuda.cpp | 212 +++- sunshine/platform/linux/cuda.cu | 248 ++++ sunshine/platform/linux/cuda.h | 46 + sunshine/video.cpp | 26 +- third-party/nvfbc/NvFBC.h | 2006 ++++++++++++++++++++++++++++++ third-party/nvfbc/helper_math.h | 1469 ++++++++++++++++++++++ 7 files changed, 4007 insertions(+), 39 deletions(-) create mode 100644 sunshine/platform/linux/cuda.cu create mode 100644 third-party/nvfbc/NvFBC.h create mode 100644 third-party/nvfbc/helper_math.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b04f96f..df0424d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,15 +4,6 @@ project(Sunshine) set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) -add_subdirectory(third-party/Simple-Web-Server) - -set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries") -set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc") -set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc") -set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc") -add_subdirectory(third-party/miniupnp/miniupnpc) -include_directories(third-party/miniupnp) - if(WIN32) # Ugly hack to compile with #include add_compile_definitions( @@ -21,9 +12,20 @@ if(WIN32) QOS_NON_ADAPTIVE_FLOW=2) endif() add_subdirectory(third-party/moonlight-common-c/enet) +add_subdirectory(third-party/Simple-Web-Server) +add_subdirectory(third-party/cbs) + +set(UPNPC_BUILD_SHARED OFF CACHE BOOL "no shared libraries") +set(UPNPC_BUILD_TESTS OFF CACHE BOOL "Don't build tests for miniupnpc") +set(UPNPC_BUILD_SAMPLE OFF CACHE BOOL "Don't build samples for miniupnpc") +set(UPNPC_NO_INSTALL ON CACHE BOOL "Don't install any libraries build for miniupnpc") +add_subdirectory(third-party/miniupnp/miniupnpc) +include_directories(third-party/miniupnp) find_package(Threads REQUIRED) find_package(OpenSSL REQUIRED) +set(Boost_USE_STATIC_LIBS ON) +find_package(Boost COMPONENTS log filesystem REQUIRED) list(APPEND SUNSHINE_COMPILE_OPTIONS -fPIC -Wall -Wno-missing-braces -Wno-maybe-uninitialized -Wno-sign-compare) @@ -106,6 +108,11 @@ else() option(SUNSHINE_ENABLE_X11 "Enable X11 grab if available" ON) option(SUNSHINE_ENABLE_WAYLAND "Enable building wayland specific code" ON) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 75) + endif() + enable_language(CUDA) + if(${SUNSHINE_ENABLE_X11}) find_package(X11) else() @@ -188,6 +195,7 @@ else() sunshine/platform/linux/publish.cpp sunshine/platform/linux/vaapi.h sunshine/platform/linux/vaapi.cpp + sunshine/platform/linux/cuda.cu sunshine/platform/linux/cuda.cpp sunshine/platform/linux/cuda.h sunshine/platform/linux/graphics.h @@ -203,7 +211,8 @@ else() third-party/glad/include/EGL/eglplatform.h third-party/glad/include/KHR/khrplatform.h third-party/glad/include/glad/gl.h - third-party/glad/include/glad/egl.h) + third-party/glad/include/glad/egl.h + third-party/nvfbc/NvFBC.h) list(APPEND PLATFORM_LIBRARIES dl @@ -215,7 +224,8 @@ else() include_directories( /usr/include/libevdev-1.0 third-party/nv-codec-headers/include - third-party/glad/include) + third-party/glad/include + third-party/nvfbc) if(NOT DEFINED SUNSHINE_EXECUTABLE_PATH) set(SUNSHINE_EXECUTABLE_PATH "sunshine") @@ -224,11 +234,6 @@ else() configure_file(sunshine.service.in sunshine.service @ONLY) endif() -add_subdirectory(third-party/cbs) - -set(Boost_USE_STATIC_LIBS ON) -find_package(Boost COMPONENTS log filesystem REQUIRED) - set(SUNSHINE_TARGET_FILES third-party/moonlight-common-c/reedsolomon/rs.c third-party/moonlight-common-c/reedsolomon/rs.h @@ -290,7 +295,7 @@ include_directories( string(TOUPPER "x${CMAKE_BUILD_TYPE}" BUILD_TYPE) if("${BUILD_TYPE}" STREQUAL "XDEBUG") - list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -pedantic -ggdb3) + list(APPEND SUNSHINE_COMPILE_OPTIONS -O0 -ggdb3) if(WIN32) set_source_files_properties(sunshine/nvhttp.cpp PROPERTIES COMPILE_FLAGS -O2) endif() diff --git a/sunshine/platform/linux/cuda.cpp b/sunshine/platform/linux/cuda.cpp index cdcaba57..811293d6 100644 --- a/sunshine/platform/linux/cuda.cpp +++ b/sunshine/platform/linux/cuda.cpp @@ -1,9 +1,4 @@ -#include "cuda.h" -#include "graphics.h" -#include "sunshine/main.h" -#include "sunshine/utility.h" -#include "wayland.h" -#include "x11grab.h" +#include #include extern "C" { @@ -12,6 +7,13 @@ extern "C" { #include } +#include "cuda.h" +#include "graphics.h" +#include "sunshine/main.h" +#include "sunshine/utility.h" +#include "wayland.h" +#include "x11grab.h" + #define SUNSHINE_STRINGVIEW_HELPER(x) x##sv #define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x) @@ -23,6 +25,13 @@ extern "C" { using namespace std::literals; namespace cuda { +constexpr auto cudaDevAttrMaxThreadsPerBlock = (CUdevice_attribute)1; +constexpr auto cudaDevAttrMaxThreadsPerMultiProcessor = (CUdevice_attribute)39; + +void pass_error(const std::string_view &sv, const char *name, const char *description) { + BOOST_LOG(error) << sv << name << ':' << description; +} + void cff(CudaFunctions *cf) { cuda_free_functions(&cf); } @@ -151,7 +160,7 @@ int init() { return 0; } -class cuda_t : public platf::hwdevice_t { +class opengl_t : public platf::hwdevice_t { public: int init(int in_width, int in_height, platf::x11::xdisplay_t::pointer xdisplay) { if(!cdf) { @@ -273,16 +282,203 @@ public: int width, height; }; +class cuda_t : public platf::hwdevice_t { +public: + ~cuda_t() override { + // sws_t needs to be destroyed while the context is active + if(sws) { + ctx_t ctx { cuda_ctx }; + + sws.reset(); + } + } + + int init(int in_width, int in_height) { + if(!cdf) { + BOOST_LOG(warning) << "cuda not initialized"sv; + return -1; + } + + data = (void *)0x1; + + width = in_width; + height = in_height; + + return 0; + } + + int set_frame(AVFrame *frame) override { + this->hwframe.reset(frame); + this->frame = frame; + + if(((AVHWFramesContext *)frame->hw_frames_ctx->data)->sw_format != AV_PIX_FMT_NV12) { + BOOST_LOG(error) << "cuda::cuda_t doesn't support any format other than AV_PIX_FMT_NV12"sv; + return -1; + } + + if(av_hwframe_get_buffer(frame->hw_frames_ctx, frame, 0)) { + BOOST_LOG(error) << "Couldn't get hwframe for NVENC"sv; + + return -1; + } + + cuda_ctx = ((AVCUDADeviceContext *)((AVHWFramesContext *)frame->hw_frames_ctx->data)->device_ctx->hwctx)->cuda_ctx; + + ctx_t ctx { cuda_ctx }; + sws = sws_t::make(width * 4, height, frame->width, frame->height); + + if(!sws) { + return -1; + } + + return 0; + } + + int convert(platf::img_t &img) override { + ctx_t ctx { cuda_ctx }; + + return sws->load_ram(img) || sws->convert(frame->data[0], frame->data[1], frame->linesize[0], frame->linesize[1]); + } + + void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) override { + ctx_t ctx { cuda_ctx }; + sws->set_colorspace(colorspace, color_range); + } + + frame_t hwframe; + + std::unique_ptr sws; + + int width, height; + + CUcontext cuda_ctx; +}; + std::shared_ptr make_hwdevice(int width, int height, platf::x11::xdisplay_t::pointer xdisplay) { if(init()) { return nullptr; } auto cuda = std::make_shared(); - if(cuda->init(width, height, xdisplay)) { + if(cuda->init(width, height)) { return nullptr; } return cuda; } } // namespace cuda + +namespace platf::nvfbc { +static PNVFBCCREATEINSTANCE createInstance {}; +static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION }; + +static void *handle { nullptr }; +int init() { + static bool funcs_loaded = false; + + if(funcs_loaded) return 0; + + if(!handle) { + handle = dyn::handle({ "libnvidia-fbc.so.1", "libnvidia-fbc.so" }); + if(!handle) { + return -1; + } + } + + std::vector> funcs { + { (dyn::apiproc *)&createInstance, "NvFBCCreateInstance" }, + }; + + if(dyn::load(handle, funcs)) { + dlclose(handle); + handle = nullptr; + + return -1; + } + + funcs_loaded = true; + return 0; +} + +class handle_t { + KITTY_USING_MOVE_T(session_t, NVFBC_SESSION_HANDLE, std::numeric_limits::max(), { + if(el == std::numeric_limits::max()) { + return; + } + NVFBC_DESTROY_HANDLE_PARAMS params { NVFBC_DESTROY_HANDLE_PARAMS_VER }; + + auto status = func.nvFBCDestroyHandle(el, ¶ms); + if(status) { + BOOST_LOG(error) << "Failed to destroy nvfbc handle: "sv << func.nvFBCGetLastErrorStr(el); + } + }); + +public: + static std::optional make() { + NVFBC_CREATE_HANDLE_PARAMS params { NVFBC_CREATE_HANDLE_PARAMS_VER }; + session_t session; + + auto status = func.nvFBCCreateHandle(&session.el, ¶ms); + if(status) { + BOOST_LOG(error) << "Failed to create session: "sv << func.nvFBCGetLastErrorStr(session.el); + session.release(); + + return std::nullopt; + } + + return handle_t { std::move(session) }; + } + + const char *last_error() { + return func.nvFBCGetLastErrorStr(session.el); + } + + std::optional status() { + NVFBC_GET_STATUS_PARAMS params { NVFBC_GET_STATUS_PARAMS_VER }; + + auto status = func.nvFBCGetStatus(session.el, ¶ms); + if(status) { + BOOST_LOG(error) << "Failed to create session: "sv << last_error(); + + return std::nullopt; + } + + return params; + } + + session_t session; +}; + +std::vector nvfbc_display_names() { + if(init()) { + return {}; + } + + std::vector display_names; + + auto status = createInstance(&func); + if(status) { + BOOST_LOG(error) << "Unable to create NvFBC instance"sv; + return {}; + } + + auto handle = handle_t::make(); + if(!handle) { + return {}; + } + + auto status_params = handle->status(); + if(!status_params) { + return {}; + } + + if(!status_params->bIsCapturePossible) { + BOOST_LOG(error) << "NVidia driver doesn't support NvFBC screencasting"sv; + } + + BOOST_LOG(info) << "Found ["sv << status_params->dwOutputNum << "] outputs"sv; + BOOST_LOG(info) << "Virtual Desktop: "sv << status_params->screenSize.w << 'x' << status_params->screenSize.h; + + return display_names; +} +} // namespace platf::nvfbc \ No newline at end of file diff --git a/sunshine/platform/linux/cuda.cu b/sunshine/platform/linux/cuda.cu new file mode 100644 index 00000000..a2ca6508 --- /dev/null +++ b/sunshine/platform/linux/cuda.cu @@ -0,0 +1,248 @@ +// #include +#include +#include +#include +#include +#include + +#include "cuda.h" + +using namespace std::literals; + +#define SUNSHINE_STRINGVIEW_HELPER(x) x##sv +#define SUNSHINE_STRINGVIEW(x) SUNSHINE_STRINGVIEW_HELPER(x) + +#define CU_CHECK(x, y) \ + if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return -1 + +#define CU_CHECK_VOID(x, y) \ + if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return; + +#define CU_CHECK_PTR(x, y) \ + if(check((x), SUNSHINE_STRINGVIEW(y ": "))) return nullptr; + +#define CU_CHECK_IGNORE(x, y) \ + check((x), SUNSHINE_STRINGVIEW(y ": ")) + +using namespace std::literals; + +//////////////////// Special desclarations +/** + * NVCC segfaults when including + * Therefore, some declarations need to be added explicitely + */ +namespace platf { +struct img_t { +public: + std::uint8_t *data {}; + std::int32_t width {}; + std::int32_t height {}; + std::int32_t pixel_pitch {}; + std::int32_t row_pitch {}; + + virtual ~img_t() = default; +}; +} // namespace platf + +namespace video { +using __float4 = float[4]; +using __float3 = float[3]; +using __float2 = float[2]; + +struct __attribute__((__aligned__(16))) color_t { + float4 color_vec_y; + float4 color_vec_u; + float4 color_vec_v; + float2 range_y; + float2 range_uv; +}; + +struct __attribute__((__aligned__(16))) color_extern_t { + __float4 color_vec_y; + __float4 color_vec_u; + __float4 color_vec_v; + __float2 range_y; + __float2 range_uv; +}; + +extern color_extern_t colors[4]; +} // namespace video + +//////////////////// End special declarations + +namespace cuda { +auto constexpr INVALID_TEXTURE = std::numeric_limits::max(); + +template +inline T div_align(T l, T r) { + return (l + r - 1) / r; +} + +void pass_error(const std::string_view &sv, const char *name, const char *description); +inline static int check(cudaError_t result, const std::string_view &sv) { + if(result) { + auto name = cudaGetErrorName(result); + auto description = cudaGetErrorString(result); + + pass_error(sv, name, description); + return -1; + } + + return 0; +} + +__device__ __constant__ video::color_t color; + + +inline __device__ float3 bgra_to_rgb(uchar4 vec) { + return make_float3((float)vec.z, (float)vec.y, (float)vec.x); +} + +inline __device__ float2 calcUV(float3 pixel) { + float4 vec_u = color.color_vec_u; + float4 vec_v = color.color_vec_v; + + float u = dot(pixel, make_float3(vec_u)) + vec_u.w; + float v = dot(pixel, make_float3(vec_v)) + vec_v.w; + + u = u * color.range_uv.x + color.range_uv.y; + v = (v * color.range_uv.x + color.range_uv.y) * 224.0f / 256.0f + 0.0625f * 256.0f; + + return make_float2(u, v); +} + +inline __device__ float calcY(float3 pixel) { + float4 vec_y = color.color_vec_y; + + return (dot(pixel, make_float3(vec_y)) + vec_y.w) * color.range_y.x + color.range_y.y; +} + +__global__ void RGBA_to_NV12( + cudaTextureObject_t srcImage, std::uint8_t *dstY, std::uint8_t *dstUV, + std::uint32_t dstPitchY, std::uint32_t dstPitchUV, + std::uint32_t width, std::uint32_t height) { + + int idX = (threadIdx.x + blockDim.x * blockIdx.x) * 2; + int idY = (threadIdx.y + blockDim.y * blockIdx.y); + + if(idX >= width) return; + if(idY >= height) return; + + dstY = dstY + idX + idY * dstPitchY; + dstUV = dstUV + idX + (idY / 2 * dstPitchUV); + + float x = (float)idX / (float)width / 4; + float y = (float)idY / (float)height; + + float3 rgb_l = bgra_to_rgb(tex2D(srcImage, x, y)); + float3 rgb_r = bgra_to_rgb(tex2D(srcImage, x + 0.25f / width, y + 1.0f / height)); + + float2 uv = calcUV((rgb_l + rgb_r) * 0.5f); + + dstUV[0] = uv.x; + dstUV[1] = uv.y; + dstY[0] = calcY(rgb_l); + dstY[1] = calcY(rgb_r); +} + +sws_t::sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock) + : array {}, texture { INVALID_TEXTURE }, width { out_width }, height { out_height }, threadsPerBlock { threadsPerBlock } { + auto format = cudaCreateChannelDesc(); + + CU_CHECK_VOID(cudaMallocArray(&array, &format, in_width, in_height, cudaArrayDefault), "Couldn't allocate cuda array"); + + cudaResourceDesc res {}; + res.resType = cudaResourceTypeArray; + res.res.array.array = array; + + cudaTextureDesc desc {}; + + desc.readMode = cudaReadModeElementType; + desc.filterMode = cudaFilterModePoint; + desc.normalizedCoords = true; + + std::fill_n(std::begin(desc.addressMode), 2, cudaAddressModeClamp); + + CU_CHECK_VOID(cudaCreateTextureObject(&texture, &res, &desc, nullptr), "Couldn't create cuda texture"); +} + +sws_t::~sws_t() { + if(texture != INVALID_TEXTURE) { + CU_CHECK_IGNORE(cudaDestroyTextureObject(texture), "Couldn't deallocate cuda texture"); + + texture = INVALID_TEXTURE; + } + + if(array) { + CU_CHECK_IGNORE(cudaFreeArray(array), "Couldn't deallocate cuda array"); + + array = cudaArray_t {}; + } +} + +std::unique_ptr sws_t::make(int in_width, int in_height, int out_width, int out_height) { + cudaDeviceProp props; + int device; + CU_CHECK_PTR(cudaGetDevice(&device), "Couldn't get cuda device"); + CU_CHECK_PTR(cudaGetDeviceProperties(&props, device), "Couldn't get cuda device properties"); + + auto sws = std::make_unique(in_width, in_height, out_width, out_height, props.maxThreadsPerMultiProcessor / props.maxBlocksPerMultiProcessor / 2); + + if(sws->texture == INVALID_TEXTURE) { + return nullptr; + } + + return sws; +} + +int sws_t::convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV) { + int threadsX = width / 2; + int threadsY = height; + + dim3 block(threadsPerBlock, threadsPerBlock); + dim3 grid(div_align(threadsX, threadsPerBlock), div_align(threadsY, threadsPerBlock)); + + RGBA_to_NV12<<>>(texture, Y, UV, pitchY, pitchUV, width, height); + + return CU_CHECK_IGNORE(cudaGetLastError(), "RGBA_to_NV12 failed"); +} + +void sws_t::set_colorspace(std::uint32_t colorspace, std::uint32_t color_range) { + color_range = 1; + colorspace = 5; + video::color_extern_t *color_p; + switch(colorspace) { + case 5: // SWS_CS_SMPTE170M + color_p = &video::colors[0]; + break; + case 1: // SWS_CS_ITU709 + color_p = &video::colors[2]; + break; + case 9: // SWS_CS_BT2020 + default: + color_p = &video::colors[0]; + }; + + if(color_range > 1) { + // Full range + ++color_p; + } + + auto color_matrix = *(video::color_t*)color_p; + color_matrix.color_vec_y.w *= 256.0f; + color_matrix.color_vec_u.w *= 256.0f; + color_matrix.color_vec_v.w *= 256.0f; + + color_matrix.range_y.y *= 256.0f; + color_matrix.range_uv.y *= 256.0f; + + static_assert(sizeof(video::color_t) == sizeof(video::color_extern_t), "color matrix struct mismatch"); + + CU_CHECK_IGNORE(cudaMemcpyToSymbol(color, &color_matrix, sizeof(video::color_t)), "Couldn't copy color matrix to cuda"); +} + +int sws_t::load_ram(platf::img_t &img) { + return CU_CHECK_IGNORE(cudaMemcpy2DToArray(array, 0, 0, img.data, img.row_pitch, img.width * img.pixel_pitch, img.height, cudaMemcpyHostToDevice), "Couldn't copy to cuda array"); +} + +} // namespace cuda \ No newline at end of file diff --git a/sunshine/platform/linux/cuda.h b/sunshine/platform/linux/cuda.h index a56c961b..41087506 100644 --- a/sunshine/platform/linux/cuda.h +++ b/sunshine/platform/linux/cuda.h @@ -1,6 +1,8 @@ #ifndef SUNSHINE_PLATFORM_CUDA_H #define SUNSHINE_PLATFORM_CUDA_H +#ifndef __NVCC__ + #include "sunshine/platform/common.h" #include "x11grab.h" @@ -9,4 +11,48 @@ std::shared_ptr make_hwdevice(int width, int height, platf::x int init(); } // namespace cuda +#else +namespace platf { +class img_t; +} +#endif + +typedef struct cudaArray *cudaArray_t; + +#if !defined(__CUDACC__) +typedef unsigned long long cudaTextureObject_t; +#else /* defined(__CUDACC__) */ +typedef __location__(device_builtin) unsigned long long cudaTextureObject_t; +#endif /* !defined(__CUDACC__) */ + +namespace cuda { +class sws_t { +public: + ~sws_t(); + sws_t(int in_width, int in_height, int out_width, int out_height, int threadsPerBlock); + + /** + * in_width, out_width -- The width and height of the captured image in bytes + * out_width, out_height -- the width and height of the NV12 image in pixels + * + * cuda_device -- pointer to the cuda device + */ + static std::unique_ptr make(int in_width, int in_height, int out_width, int out_height); + + // Converts loaded image into a CUDevicePtr + int convert(std::uint8_t *Y, std::uint8_t *UV, std::uint32_t pitchY, std::uint32_t pitchUV); + + void set_colorspace(std::uint32_t colorspace, std::uint32_t color_range); + + int load_ram(platf::img_t &img); + + cudaArray_t array; + cudaTextureObject_t texture; + + int width, height; + + int threadsPerBlock; +}; +} // namespace cuda + #endif \ No newline at end of file diff --git a/sunshine/video.cpp b/sunshine/video.cpp index 7205aa07..3b143cbe 100644 --- a/sunshine/video.cpp +++ b/sunshine/video.cpp @@ -324,7 +324,7 @@ struct encoder_t { class session_t { public: session_t() = default; - session_t(ctx_t &&ctx, util::wrap_ptr &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {} + session_t(ctx_t &&ctx, std::shared_ptr &&device, int inject) : ctx { std::move(ctx) }, device { std::move(device) }, inject { inject } {} session_t(session_t &&other) noexcept = default; @@ -342,7 +342,7 @@ public: } ctx_t ctx; - util::wrap_ptr device; + std::shared_ptr device; std::vector replacements; @@ -369,7 +369,6 @@ struct sync_session_t { sync_session_ctx_t *ctx; platf::img_t *img_tmp; - std::shared_ptr hwdevice; session_t session; }; @@ -779,7 +778,7 @@ int encode(int64_t frame_nr, session_t &session, frame_t::pointer frame, safe::m return 0; } -std::optional make_session(const encoder_t &encoder, const config_t &config, int width, int height, platf::hwdevice_t *hwdevice) { +std::optional make_session(const encoder_t &encoder, const config_t &config, int width, int height, std::shared_ptr &&hwdevice) { bool hardware = encoder.dev_type != AV_HWDEVICE_TYPE_NONE; auto &video_format = config.videoFormat == 0 ? encoder.h264 : encoder.hevc; @@ -886,7 +885,7 @@ std::optional make_session(const encoder_t &encoder, const config_t & if(hardware) { ctx->pix_fmt = encoder.dev_pix_fmt; - auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice); + auto buf_or_error = encoder.make_hwdevice_ctx(hwdevice.get()); if(buf_or_error.has_right()) { return std::nullopt; } @@ -965,7 +964,7 @@ std::optional make_session(const encoder_t &encoder, const config_t & frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx); } - util::wrap_ptr device; + std::shared_ptr device; if(!hwdevice->data) { auto device_tmp = std::make_unique(); @@ -977,7 +976,7 @@ std::optional make_session(const encoder_t &encoder, const config_t & device = std::move(device_tmp); } else { - device = hwdevice; + device = std::move(hwdevice); } if(device->set_frame(frame.release())) { @@ -1009,12 +1008,12 @@ void encode_run( img_event_t images, config_t config, int width, int height, - platf::hwdevice_t *hwdevice, + std::shared_ptr &&hwdevice, safe::signal_t &reinit_event, const encoder_t &encoder, void *channel_data) { - auto session = make_session(encoder, config, width, height, hwdevice); + auto session = make_session(encoder, config, width, height, std::move(hwdevice)); if(!session) { return; } @@ -1101,12 +1100,11 @@ std::optional make_synced_session(platf::display_t *disp, const // absolute mouse coordinates require that the dimensions of the screen are known ctx.touch_port_events->raise(make_port(disp, ctx.config)); - auto session = make_session(encoder, ctx.config, img.width, img.height, hwdevice.get()); + auto session = make_session(encoder, ctx.config, img.width, img.height, std::move(hwdevice)); if(!session) { return std::nullopt; } - encode_session.hwdevice = std::move(hwdevice); encode_session.session = std::move(*session); return std::move(encode_session); @@ -1208,7 +1206,7 @@ encode_e encode_run_sync( ctx->idr_events->pop(); } - if(pos->hwdevice->convert(*img)) { + if(pos->session.device->convert(*img)) { BOOST_LOG(error) << "Could not convert image"sv; ctx->shutdown_event->raise(true); @@ -1356,7 +1354,7 @@ void capture_async( frame_nr, mail, images, config, display->width, display->height, - hwdevice.get(), + std::move(hwdevice), ref->reinit_event, *ref->encoder_p, channel_data); } @@ -1409,7 +1407,7 @@ int validate_config(std::shared_ptr &disp, const encoder_t &en return -1; } - auto session = make_session(encoder, config, disp->width, disp->height, hwdevice.get()); + auto session = make_session(encoder, config, disp->width, disp->height, std::move(hwdevice)); if(!session) { return -1; } diff --git a/third-party/nvfbc/NvFBC.h b/third-party/nvfbc/NvFBC.h new file mode 100644 index 00000000..8990eeab --- /dev/null +++ b/third-party/nvfbc/NvFBC.h @@ -0,0 +1,2006 @@ +/*! + * \file + * + * This file contains the interface constants, structure definitions and + * function prototypes defining the NvFBC API for Linux. + * + * Copyright (c) 2013-2020, NVIDIA CORPORATION. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#ifndef _NVFBC_H_ +#define _NVFBC_H_ + +#include + +/*! + * \mainpage NVIDIA Framebuffer Capture (NvFBC) for Linux. + * + * NvFBC is a high performance, low latency API to capture the framebuffer of + * an X server screen. + * + * The output from NvFBC captures everything that would be visible if we were + * directly looking at the monitor. This includes window manager decoration, + * mouse cursor, overlay, etc. + * + * It is ideally suited to desktop or fullscreen application capture and + * remoting. + */ + +/*! + * \defgroup FBC_REQ Requirements + * + * The following requirements are provided by the regular NVIDIA Display Driver + * package: + * + * - OpenGL core >= 4.2: + * Required. NvFBC relies on OpenGL to perform frame capture and + * post-processing. + * + * - Vulkan 1.1: + * Required. + * + * - libcuda.so.1 >= 5.5: + * Optional. Used for capture to video memory with CUDA interop. + * + * The following requirements must be installed separately depending on the + * Linux distribution being used: + * + * - XRandR extension >= 1.2: + * Optional. Used for RandR output tracking. + * + * - libX11-xcb.so.1 >= 1.2: + * Required. NvFBC uses a mix of Xlib and XCB. Xlib is needed to use GLX, + * XCB is needed to make NvFBC more resilient against X server terminations + * while a capture session is active. + * + * - libxcb.so.1 >= 1.3: + * Required. See above. + * + * - xorg-server >= 1.3: + * Optional. Required for push model to work properly. + * + * Note that all optional dependencies are dlopen()'d at runtime. Failure to + * load an optional library is not fatal. + */ + +/*! + * \defgroup FBC_CHANGES ChangeLog + * + * NvFBC Linux API version 0.1 + * - Initial BETA release. + * + * NvFBC Linux API version 0.2 + * - Added 'bEnableMSE' field to NVFBC_H264_HW_ENC_CONFIG. + * - Added 'dwMSE' field to NVFBC_TOH264_GRAB_FRAME_PARAMS. + * - Added 'bEnableAQ' field to NVFBC_H264_HW_ENC_CONFIG. + * - Added 'NVFBC_H264_PRESET_LOSSLESS_HP' enum to NVFBC_H264_PRESET. + * - Added 'NVFBC_BUFFER_FORMAT_YUV444P' enum to NVFBC_BUFFER_FORMAT. + * - Added 'eInputBufferFormat' field to NVFBC_H264_HW_ENC_CONFIG. + * - Added '0' and '244' values for NVFBC_H264_HW_ENC_CONFIG::dwProfile. + * + * NvFBC Linux API version 0.3 + * - Improved multi-threaded support by implementing an API locking mechanism. + * - Added 'nvFBCBindContext' API entry point. + * - Added 'nvFBCReleaseContext' API entry point. + * + * NvFBC Linux API version 1.0 + * - Added codec agnostic interface for HW encoding. + * - Deprecated H.264 interface. + * - Added support for H.265/HEVC HW encoding. + * + * NvFBC Linux API version 1.1 + * - Added 'nvFBCToHwGetCaps' API entry point. + * - Added 'dwDiffMapScalingFactor' field to NVFBC_TOSYS_SETUP_PARAMS. + * + * NvFBC Linux API version 1.2 + * - Deprecated ToHwEnc interface. + * - Added ToGL interface that captures frames to an OpenGL texture in video + * memory. + * - Added 'bDisableAutoModesetRecovery' field to + * NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * - Added 'bExternallyManagedContext' field to NVFBC_CREATE_HANDLE_PARAMS. + * + * NvFBC Linux API version 1.3 + * - Added NVFBC_BUFFER_FORMAT_RGBA + * - Added 'dwTimeoutMs' field to NVFBC_TOSYS_GRAB_FRAME_PARAMS, + * NVFBC_TOCUDA_GRAB_FRAME_PARAMS, and NVFBC_TOGL_GRAB_FRAME_PARAMS. + * + * NvFBC Linux API version 1.4 + * - Clarified that NVFBC_BUFFER_FORMAT_{ARGB,RGB,RGBA} are byte-order formats. + * - Renamed NVFBC_BUFFER_FORMAT_YUV420P to NVFBC_BUFFER_FORMAT_NV12. + * - Added new requirements. + * - Made NvFBC more resilient against the X server terminating during an active + * capture session. See new comments for ::NVFBC_ERR_X. + * - Relaxed requirement that 'frameSize' must have a width being a multiple of + * 4 and a height being a multiple of 2. + * - Added 'bRoundFrameSize' field to NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * - Relaxed requirement that the scaling factor for differential maps must be + * a multiple of the size of the frame. + * - Added 'diffMapSize' field to NVFBC_TOSYS_SETUP_PARAMS and + * NVFBC_TOGL_SETUP_PARAMS. + * + * NvFBC Linux API version 1.5 + * - Added NVFBC_BUFFER_FORMAT_BGRA + * + * NvFBC Linux API version 1.6 + * - Added the 'NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY', + * 'NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY', and + * 'NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY' capture flags. + * - Exposed debug and performance logs through the NVFBC_LOG_LEVEL environment + * variable. Setting it to "1" enables performance logs, setting it to "2" + * enables debugging logs, setting it to "3" enables both. + * - Logs are printed to stdout or to the file pointed by the NVFBC_LOG_FILE + * environment variable. + * - Added 'ulTimestampUs' to NVFBC_FRAME_GRAB_INFO. + * - Added 'dwSamplingRateMs' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * - Added 'bPushModel' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * + * NvFBC Linux API version 1.7 + * - Retired the NVFBC_CAPTURE_TO_HW_ENCODER interface. + * This interface has been deprecated since NvFBC 1.2 and has received no + * updates or new features since. We recommend using the NVIDIA Video Codec + * SDK to encode NvFBC frames. + * See: https://developer.nvidia.com/nvidia-video-codec-sdk + * - Added a 'Capture Modes' section to those headers. + * - Added a 'Post Processing' section to those headers. + * - Added an 'Environment Variables' section to those headers. + * - Added 'bInModeset' to NVFBC_GET_STATUS_PARAMS. + * - Added 'bAllowDirectCapture' to NVFBC_CREATE_CAPTURE_SESSION_PARAMS. + * - Added 'bDirectCaptured' to NVFBC_FRAME_GRAB_INFO. + * - Added 'bRequiredPostProcessing' to NVFBC_FRAME_GRAB_INFO. + */ + +/*! + * \defgroup FBC_MODES Capture Modes + * + * When creating a capture session, NvFBC instantiates a capture subsystem + * living in the NVIDIA X driver. + * + * This subsystem listens for damage events coming from applications then + * generates (composites) frames for NvFBC when new content is available. + * + * This capture server can operate on a timer where it periodically checks if + * there are any pending damage events, or it can generate frames as soon as it + * receives a new damage event. + * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::dwSamplingRateMs, + * and NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bPushModel. + * + * NvFBC can also attach itself to a fullscreen unoccluded application and have + * it copy its frames directly into a buffer owned by NvFBC upon present. This + * mode bypasses the X server. + * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture. + * + * NvFBC is designed to capture frames with as few copies as possible. The + * NVIDIA X driver composites frames directly into the NvFBC buffers, and + * direct capture copies frames directly into these buffers as well. + * + * Depending on the configuration of a capture session, an extra copy (rendering + * pass) may be needed. See the 'Post Processing' section. + */ + +/*! + * \defgroup FBC_PP Post Processing + * + * Depending on the configuration of a capture session, NvFBC might require to + * do post processing on frames. + * + * Post processing is required for the following reasons: + * - NvFBC needs to do a pixel format conversion. + * - Diffmaps are requested. + * - Capture to system memory is requested. + * + * NvFBC needs to do a conversion if the requested pixel format does not match + * the native format. The native format is NVFBC_BUFFER_FORMAT_BGRA. + * + * Note: post processing is *not* required for frame scaling and frame cropping. + * + * Skipping post processing can reduce capture latency. An application can know + * whether post processing was required by checking + * NVFBC_FRAME_GRAB_INFO::bRequiredPostProcessing. + */ + +/*! + * \defgroup FBC_ENVVAR Environment Variables + * + * Below are the environment variables supported by NvFBC: + * + * - NVFBC_LOG_LEVEL + * Bitfield where the first bit enables debug logs and the second bit enables + * performance logs. Both can be enabled by setting this envvar to 3. + * + * - NVFBC_LOG_FILE + * Write all NvFBC logs to the given file. + * + * - NVFBC_FORCE_ALLOW_DIRECT_CAPTURE + * Used to override NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture. + * + * - NVFBC_FORCE_POST_PROCESSING + * Used to force the post processing step, even if it could be skipped. + * See the 'Post Processing' section. + */ + +/*! + * \defgroup FBC_STRUCT Structure Definition + * + * @{ + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*! + * Calling convention. + */ +#define NVFBCAPI + +/*! + * NvFBC API major version. + */ +#define NVFBC_VERSION_MAJOR 1 + +/*! + * NvFBC API minor version. + */ +#define NVFBC_VERSION_MINOR 7 + +/*! + * NvFBC API version. + */ +#define NVFBC_VERSION (uint32_t) (NVFBC_VERSION_MINOR | (NVFBC_VERSION_MAJOR << 8)) + +/*! + * Creates a version number for structure parameters. + */ +#define NVFBC_STRUCT_VERSION(typeName, ver) \ + (uint32_t) (sizeof(typeName) | ((ver) << 16) | (NVFBC_VERSION << 24)) + +/*! + * Defines error codes. + * + * \see NvFBCGetLastErrorStr + */ +typedef enum _NVFBCSTATUS +{ + /*! + * This indicates that the API call returned with no errors. + */ + NVFBC_SUCCESS = 0, + /*! + * This indicates that the API version between the client and the library + * is not compatible. + */ + NVFBC_ERR_API_VERSION = 1, + /*! + * An internal error occurred. + */ + NVFBC_ERR_INTERNAL = 2, + /*! + * This indicates that one or more of the parameter passed to the API call + * is invalid. + */ + NVFBC_ERR_INVALID_PARAM = 3, + /*! + * This indicates that one or more of the pointers passed to the API call + * is invalid. + */ + NVFBC_ERR_INVALID_PTR = 4, + /*! + * This indicates that the handle passed to the API call to identify the + * client is invalid. + */ + NVFBC_ERR_INVALID_HANDLE = 5, + /*! + * This indicates that the maximum number of threaded clients of the same + * process has been reached. The limit is 10 threads per process. + * There is no limit on the number of process. + */ + NVFBC_ERR_MAX_CLIENTS = 6, + /*! + * This indicates that the requested feature is not currently supported + * by the library. + */ + NVFBC_ERR_UNSUPPORTED = 7, + /*! + * This indicates that the API call failed because it was unable to allocate + * enough memory to perform the requested operation. + */ + NVFBC_ERR_OUT_OF_MEMORY = 8, + /*! + * This indicates that the API call was not expected. This happens when + * API calls are performed in a wrong order, such as trying to capture + * a frame prior to creating a new capture session; or trying to set up + * a capture to video memory although a capture session to system memory + * was created. + */ + NVFBC_ERR_BAD_REQUEST = 9, + /*! + * This indicates an X error, most likely meaning that the X server has + * been terminated. When this error is returned, the only resort is to + * create another FBC handle using NvFBCCreateHandle(). + * + * The previous handle should still be freed with NvFBCDestroyHandle(), but + * it might leak resources, in particular X, GLX, and GL resources since + * it is no longer possible to communicate with an X server to free them + * through the driver. + * + * The best course of action to eliminate this potential leak is to close + * the OpenGL driver, close the forked process running the capture, or + * restart the application. + */ + NVFBC_ERR_X = 10, + /*! + * This indicates a GLX error. + */ + NVFBC_ERR_GLX = 11, + /*! + * This indicates an OpenGL error. + */ + NVFBC_ERR_GL = 12, + /*! + * This indicates a CUDA error. + */ + NVFBC_ERR_CUDA = 13, + /*! + * This indicates a HW encoder error. + */ + NVFBC_ERR_ENCODER = 14, + /*! + * This indicates an NvFBC context error. + */ + NVFBC_ERR_CONTEXT = 15, + /*! + * This indicates that the application must recreate the capture session. + * + * This error can be returned if a modeset event occurred while capturing + * frames, and NVFBC_CREATE_HANDLE_PARAMS::bDisableAutoModesetRecovery + * was set to NVFBC_TRUE. + */ + NVFBC_ERR_MUST_RECREATE = 16, + /*! + * This indicates a Vulkan error. + */ + NVFBC_ERR_VULKAN = 17, +} NVFBCSTATUS; + +/*! + * Defines boolean values. + */ +typedef enum _NVFBC_BOOL +{ + /*! + * False value. + */ + NVFBC_FALSE = 0, + /*! + * True value. + */ + NVFBC_TRUE, +} NVFBC_BOOL; + +/*! + * Maximum size in bytes of an error string. + */ +#define NVFBC_ERR_STR_LEN 512 + +/*! + * Capture type. + */ +typedef enum _NVFBC_CAPTURE_TYPE +{ + /*! + * Capture frames to a buffer in system memory. + */ + NVFBC_CAPTURE_TO_SYS = 0, + /*! + * Capture frames to a CUDA device in video memory. + * + * Specifying this will dlopen() libcuda.so.1 and fail if not available. + */ + NVFBC_CAPTURE_SHARED_CUDA, + /*! + * Retired. Do not use. + */ + /* NVFBC_CAPTURE_TO_HW_ENCODER, */ + /*! + * Capture frames to an OpenGL buffer in video memory. + */ + NVFBC_CAPTURE_TO_GL = 3, +} NVFBC_CAPTURE_TYPE; + +/*! + * Tracking type. + * + * NvFBC can track a specific region of the framebuffer to capture. + * + * An X screen corresponds to the entire framebuffer. + * + * An RandR CRTC is a component of the GPU that reads pixels from a region of + * the X screen and sends them through a pipeline to an RandR output. + * A physical monitor can be connected to an RandR output. Tracking an RandR + * output captures the region of the X screen that the RandR CRTC is sending to + * the RandR output. + */ +typedef enum +{ + /*! + * By default, NvFBC tries to track a connected primary output. If none is + * found, then it tries to track the first connected output. If none is + * found then it tracks the entire X screen. + * + * If the XRandR extension is not available, this option has the same effect + * as ::NVFBC_TRACKING_SCREEN. + * + * This default behavior might be subject to changes in the future. + */ + NVFBC_TRACKING_DEFAULT = 0, + /*! + * Track an RandR output specified by its ID in the appropriate field. + * + * The list of connected outputs can be queried via NvFBCGetStatus(). + * This list can also be obtained using e.g., xrandr(1). + * + * If the XRandR extension is not available, setting this option returns an + * error. + */ + NVFBC_TRACKING_OUTPUT, + /*! + * Track the entire X screen. + */ + NVFBC_TRACKING_SCREEN, +} NVFBC_TRACKING_TYPE; + +/*! + * Buffer format. + */ +typedef enum _NVFBC_BUFFER_FORMAT +{ + /*! + * Data will be converted to ARGB8888 byte-order format. 32 bpp. + */ + NVFBC_BUFFER_FORMAT_ARGB = 0, + /*! + * Data will be converted to RGB888 byte-order format. 24 bpp. + */ + NVFBC_BUFFER_FORMAT_RGB, + /*! + * Data will be converted to NV12 format using HDTV weights + * according to ITU-R BT.709. 12 bpp. + */ + NVFBC_BUFFER_FORMAT_NV12, + /*! + * Data will be converted to YUV 444 planar format using HDTV weights + * according to ITU-R BT.709. 24 bpp + */ + NVFBC_BUFFER_FORMAT_YUV444P, + /*! + * Data will be converted to RGBA8888 byte-order format. 32 bpp. + */ + NVFBC_BUFFER_FORMAT_RGBA, + /*! + * Native format. No pixel conversion needed. + * BGRA8888 byte-order format. 32 bpp. + */ + NVFBC_BUFFER_FORMAT_BGRA, +} NVFBC_BUFFER_FORMAT; + +#define NVFBC_BUFFER_FORMAT_YUV420P NVFBC_BUFFER_FORMAT_NV12 + +/*! + * Handle used to identify an NvFBC session. + */ +typedef uint64_t NVFBC_SESSION_HANDLE; + +/*! + * Box used to describe an area of the tracked region to capture. + * + * The coordinates are relative to the tracked region. + * + * E.g., if the size of the X screen is 3520x1200 and the tracked RandR output + * scans a region of 1600x1200+1920+0, then setting a capture box of + * 800x600+100+50 effectively captures a region of 800x600+2020+50 relative to + * the X screen. + */ +typedef struct _NVFBC_BOX +{ + /*! + * [in] X offset of the box. + */ + uint32_t x; + /*! + * [in] Y offset of the box. + */ + uint32_t y; + /*! + * [in] Width of the box. + */ + uint32_t w; + /*! + * [in] Height of the box. + */ + uint32_t h; +} NVFBC_BOX; + +/*! + * Size used to describe the size of a frame. + */ +typedef struct _NVFBC_SIZE +{ + /*! + * [in] Width. + */ + uint32_t w; + /*! + * [in] Height. + */ + uint32_t h; +} NVFBC_SIZE; + +/*! + * Describes information about a captured frame. + */ +typedef struct _NVFBC_FRAME_GRAB_INFO +{ + /*! + * [out] Width of the captured frame. + */ + uint32_t dwWidth; + /*! + * [out] Height of the captured frame. + */ + uint32_t dwHeight; + /*! + * [out] Size of the frame in bytes. + */ + uint32_t dwByteSize; + /*! + * [out] Incremental ID of the current frame. + * + * This can be used to identify a frame. + */ + uint32_t dwCurrentFrame; + /*! + * [out] Whether the captured frame is a new frame. + * + * When using non blocking calls it is possible to capture a frame + * that was already captured before if the display server did not + * render a new frame in the meantime. In that case, this flag + * will be set to NVFBC_FALSE. + * + * When using blocking calls each captured frame will have + * this flag set to NVFBC_TRUE since the blocking mechanism waits for + * the display server to render a new frame. + * + * Note that this flag does not guarantee that the content of + * the frame will be different compared to the previous captured frame. + * + * In particular, some compositing managers report the entire + * framebuffer as damaged when an application refreshes its content. + * + * Consider a single X screen spanned across physical displays A and B + * and an NvFBC application tracking display A. Depending on the + * compositing manager, it is possible that an application refreshing + * itself on display B will trigger a frame capture on display A. + * + * Workarounds include: + * - Using separate X screens + * - Disabling the composite extension + * - Using a compositing manager that properly reports what regions + * are damaged + * - Using NvFBC's diffmaps to find out if the frame changed + */ + NVFBC_BOOL bIsNewFrame; + /*! + * [out] Frame timestamp + * + * Time in micro seconds when the display server started rendering the + * frame. + * + * This does not account for when the frame was captured. If capturing an + * old frame (e.g., bIsNewFrame is NVFBC_FALSE) the reported timestamp + * will reflect the time when the old frame was rendered by the display + * server. + */ + uint64_t ulTimestampUs; + /* + * [out] Number of frames generated since the last capture. + * + * This can help applications tell whether they missed frames or there + * were no frames generated by the server since the last capture. + */ + uint32_t dwMissedFrames; + /* + * [out] Whether the captured frame required post processing. + * + * See the 'Post Processing' section. + */ + NVFBC_BOOL bRequiredPostProcessing; + /* + * [out] Whether this frame was obtained via direct capture. + * + * See NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bAllowDirectCapture. + */ + NVFBC_BOOL bDirectCapture; +} NVFBC_FRAME_GRAB_INFO; + +/*! + * Defines parameters for the CreateHandle() API call. + */ +typedef struct _NVFBC_CREATE_HANDLE_PARAMS +{ + /*! + * [in] Must be set to NVFBC_CREATE_HANDLE_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Application specific private information passed to the NvFBC + * session. + */ + const void *privateData; + /*! + * [in] Size of the application specific private information passed to the + * NvFBC session. + */ + uint32_t privateDataSize; + /*! + * [in] Whether NvFBC should not create and manage its own graphics context + * + * NvFBC internally uses OpenGL to perfom graphics operations on the + * captured frames. By default, NvFBC will create and manage (e.g., make + * current, detect new threads, etc.) its own OpenGL context. + * + * If set to NVFBC_TRUE, NvFBC will use the application's context. It will + * be the application's responsibility to make sure that a context is + * current on the thread calling into the NvFBC API. + */ + NVFBC_BOOL bExternallyManagedContext; + /*! + * [in] GLX context + * + * GLX context that NvFBC should use internally to create pixmaps and + * make them current when creating a new capture session. + * + * Note: NvFBC expects a context created against a GLX_RGBA_TYPE render + * type. + */ + void *glxCtx; + /*! + * [in] GLX framebuffer configuration + * + * Framebuffer configuration that was used to create the GLX context, and + * that will be used to create pixmaps internally. + * + * Note: NvFBC expects a configuration having at least the following + * attributes: + * GLX_DRAWABLE_TYPE, GLX_PIXMAP_BIT + * GLX_BIND_TO_TEXTURE_RGBA_EXT, 1 + * GLX_BIND_TO_TEXTURE_TARGETS_EXT, GLX_TEXTURE_2D_BIT_EXT + */ + void *glxFBConfig; +} NVFBC_CREATE_HANDLE_PARAMS; + +/*! + * NVFBC_CREATE_HANDLE_PARAMS structure version. + */ +#define NVFBC_CREATE_HANDLE_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_CREATE_HANDLE_PARAMS, 2) + +/*! + * Defines parameters for the ::NvFBCDestroyHandle() API call. + */ +typedef struct _NVFBC_DESTROY_HANDLE_PARAMS +{ + /*! + * [in] Must be set to NVFBC_DESTROY_HANDLE_PARAMS_VER + */ + uint32_t dwVersion; +} NVFBC_DESTROY_HANDLE_PARAMS; + +/*! + * NVFBC_DESTROY_HANDLE_PARAMS structure version. + */ +#define NVFBC_DESTROY_HANDLE_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_DESTROY_HANDLE_PARAMS, 1) + +/*! + * Maximum number of connected RandR outputs to an X screen. + */ +#define NVFBC_OUTPUT_MAX 5 + +/*! + * Maximum size in bytes of an RandR output name. + */ +#define NVFBC_OUTPUT_NAME_LEN 128 + +/*! + * Describes an RandR output. + * + * Filling this structure relies on the XRandR extension. This feature cannot + * be used if the extension is missing or its version is below the requirements. + * + * \see Requirements + */ +typedef struct _NVFBC_OUTPUT +{ + /*! + * Identifier of the RandR output. + */ + uint32_t dwId; + /*! + * Name of the RandR output, as reported by tools such as xrandr(1). + * + * Example: "DVI-I-0" + */ + char name[NVFBC_OUTPUT_NAME_LEN]; + /*! + * Region of the X screen tracked by the RandR CRTC driving this RandR + * output. + */ + NVFBC_BOX trackedBox; +} NVFBC_RANDR_OUTPUT_INFO; + +/*! + * Defines parameters for the ::NvFBCGetStatus() API call. + */ +typedef struct _NVFBC_GET_STATUS_PARAMS +{ + /*! + * [in] Must be set to NVFBC_GET_STATUS_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [out] Whether or not framebuffer capture is supported by the graphics + * driver. + */ + NVFBC_BOOL bIsCapturePossible; + /*! + * [out] Whether or not there is already a capture session on this system. + */ + NVFBC_BOOL bCurrentlyCapturing; + /*! + * [out] Whether or not it is possible to create a capture session on this + * system. + */ + NVFBC_BOOL bCanCreateNow; + /*! + * [out] Size of the X screen (framebuffer). + */ + NVFBC_SIZE screenSize; + /*! + * [out] Whether the XRandR extension is available. + * + * If this extension is not available then it is not possible to have + * information about RandR outputs. + */ + NVFBC_BOOL bXRandRAvailable; + /*! + * [out] Array of outputs connected to the X screen. + * + * An application can track a specific output by specifying its ID when + * creating a capture session. + * + * Only if XRandR is available. + */ + NVFBC_RANDR_OUTPUT_INFO outputs[NVFBC_OUTPUT_MAX]; + /*! + * [out] Number of outputs connected to the X screen. + * + * This must be used to parse the array of connected outputs. + * + * Only if XRandR is available. + */ + uint32_t dwOutputNum; + /*! + * [out] Version of the NvFBC library running on this system. + */ + uint32_t dwNvFBCVersion; + /*! + * [out] Whether the X server is currently in modeset. + * + * When the X server is in modeset, it must give up all its video + * memory allocations. It is not possible to create a capture + * session until the modeset is over. + * + * Note that VT-switches are considered modesets. + */ + NVFBC_BOOL bInModeset; +} NVFBC_GET_STATUS_PARAMS; + +/*! + * NVFBC_GET_STATUS_PARAMS structure version. + */ +#define NVFBC_GET_STATUS_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_GET_STATUS_PARAMS, 2) + +/*! + * Defines parameters for the ::NvFBCCreateCaptureSession() API call. + */ +typedef struct _NVFBC_CREATE_CAPTURE_SESSION_PARAMS +{ + /*! + * [in] Must be set to NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Desired capture type. + * + * Note that when specyfing ::NVFBC_CAPTURE_SHARED_CUDA NvFBC will try to + * dlopen() the corresponding libraries. This means that NvFBC can run on + * a system without the CUDA library since it does not link against them. + */ + NVFBC_CAPTURE_TYPE eCaptureType; + /*! + * [in] What region of the framebuffer should be tracked. + */ + NVFBC_TRACKING_TYPE eTrackingType; + /*! + * [in] ID of the output to track if eTrackingType is set to + * ::NVFBC_TRACKING_OUTPUT. + */ + uint32_t dwOutputId; + /*! + * [in] Crop the tracked region. + * + * The coordinates are relative to the tracked region. + * + * It can be set to 0 to capture the entire tracked region. + */ + NVFBC_BOX captureBox; + /*! + * [in] Desired size of the captured frame. + * + * This parameter allow to scale the captured frame. + * + * It can be set to 0 to disable frame resizing. + */ + NVFBC_SIZE frameSize; + /*! + * [in] Whether the mouse cursor should be composited to the frame. + * + * Disabling the cursor will not generate new frames when only the cursor + * is moved. + */ + NVFBC_BOOL bWithCursor; + /*! + * [in] Whether NvFBC should not attempt to recover from modesets. + * + * NvFBC is able to detect when a modeset event occured and can automatically + * re-create a capture session with the same settings as before, then resume + * its frame capture session transparently. + * + * This option allows to disable this behavior. NVFBC_ERR_MUST_RECREATE + * will be returned in that case. + * + * It can be useful in the cases when an application needs to do some work + * between setting up a capture and grabbing the first frame. + * + * For example: an application using the ToGL interface needs to register + * resources with EncodeAPI prior to encoding frames. + * + * Note that during modeset recovery, NvFBC will try to re-create the + * capture session every second until it succeeds. + */ + NVFBC_BOOL bDisableAutoModesetRecovery; + /*! + * [in] Whether NvFBC should round the requested frameSize. + * + * When disabled, NvFBC will not attempt to round the requested resolution. + * + * However, some pixel formats have resolution requirements. E.g., YUV/NV + * formats must have a width being a multiple of 4, and a height being a + * multiple of 2. RGB formats don't have such requirements. + * + * If the resolution doesn't meet the requirements of the format, then NvFBC + * will fail at setup time. + * + * When enabled, NvFBC will round the requested width to the next multiple + * of 4 and the requested height to the next multiple of 2. + * + * In this case, requesting any resolution will always work with every + * format. However, an NvFBC client must be prepared to handle the case + * where the requested resolution is different than the captured resolution. + * + * NVFBC_FRAME_GRAB_INFO::dwWidth and NVFBC_FRAME_GRAB_INFO::dwHeight should + * always be used for getting information about captured frames. + */ + NVFBC_BOOL bRoundFrameSize; + /*! + * [in] Rate in ms at which the display server generates new frames + * + * This controls the frequency at which the display server will generate + * new frames if new content is available. This effectively controls the + * capture rate when using blocking calls. + * + * Note that lower values will increase the CPU and GPU loads. + * + * The default value is 16ms (~ 60 Hz). + */ + uint32_t dwSamplingRateMs; + /*! + * [in] Enable push model for frame capture + * + * When set to NVFBC_TRUE, the display server will generate frames whenever + * it receives a damage event from applications. + * + * Setting this to NVFBC_TRUE will ignore ::dwSamplingRateMs. + * + * Using push model with the NVFBC_*_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY + * capture flag should guarantee the shortest amount of time between an + * application rendering a frame and an NvFBC client capturing it, provided + * that the NvFBC client is able to process the frames quickly enough. + * + * Note that applications running at high frame rates will increase CPU and + * GPU loads. + */ + NVFBC_BOOL bPushModel; + /*! + * [in] Allow direct capture + * + * Direct capture allows NvFBC to attach itself to a fullscreen graphics + * application. Whenever that application presents a frame, it makes a copy + * of it directly into a buffer owned by NvFBC thus bypassing the X server. + * + * When direct capture is *not* enabled, the NVIDIA X driver generates a + * frame for NvFBC when it receives a damage event from an application if push + * model is enabled, or periodically checks if there are any pending damage + * events otherwise (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::dwSamplingRateMs). + * + * Direct capture is possible under the following conditions: + * - Direct capture is allowed + * - Push model is enabled (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bPushModel) + * - The mouse cursor is not composited (see NVFBC_CREATE_CAPTURE_SESSION_PARAMS::bWithCursor) + * - No viewport transformation is required. This happens when the remote + * desktop is e.g. rotated. + * + * When direct capture is possible, NvFBC will automatically attach itself + * to a fullscreen unoccluded application, if such exists. + * + * Notes: + * - This includes compositing desktops such as GNOME (e.g., gnome-shell + * is the fullscreen unoccluded application). + * - There can be only one fullscreen unoccluded application at a time. + * - The NVIDIA X driver monitors which application qualifies or no + * longer qualifies. + * + * For example, if a fullscreen application is launched in GNOME, NvFBC will + * detach from gnome-shell and attach to that application. + * + * Attaching and detaching happens automatically from the perspective of an + * NvFBC client. When detaching from an application, the X driver will + * transparently resume generating frames for NvFBC. + * + * An application can know whether a given frame was obtained through + * direct capture by checking NVFBC_FRAME_GRAB_INFO::bDirectCapture. + */ + NVFBC_BOOL bAllowDirectCapture; +} NVFBC_CREATE_CAPTURE_SESSION_PARAMS; + +/*! + * NVFBC_CREATE_CAPTURE_SESSION_PARAMS structure version. + */ +#define NVFBC_CREATE_CAPTURE_SESSION_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_CREATE_CAPTURE_SESSION_PARAMS, 6) + +/*! + * Defines parameters for the ::NvFBCDestroyCaptureSession() API call. + */ +typedef struct _NVFBC_DESTROY_CAPTURE_SESSION_PARAMS +{ + /*! + * [in] Must be set to NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER + */ + uint32_t dwVersion; +} NVFBC_DESTROY_CAPTURE_SESSION_PARAMS; + +/*! + * NVFBC_DESTROY_CAPTURE_SESSION_PARAMS structure version. + */ +#define NVFBC_DESTROY_CAPTURE_SESSION_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_DESTROY_CAPTURE_SESSION_PARAMS, 1) + +/*! + * Defines parameters for the ::NvFBCBindContext() API call. + */ +typedef struct _NVFBC_BIND_CONTEXT_PARAMS +{ + /*! + * [in] Must be set to NVFBC_BIND_CONTEXT_PARAMS_VER + */ + uint32_t dwVersion; +} NVFBC_BIND_CONTEXT_PARAMS; + +/*! + * NVFBC_BIND_CONTEXT_PARAMS structure version. + */ +#define NVFBC_BIND_CONTEXT_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_BIND_CONTEXT_PARAMS, 1) + +/*! + * Defines parameters for the ::NvFBCReleaseContext() API call. + */ +typedef struct _NVFBC_RELEASE_CONTEXT_PARAMS +{ + /*! + * [in] Must be set to NVFBC_RELEASE_CONTEXT_PARAMS_VER + */ + uint32_t dwVersion; +} NVFBC_RELEASE_CONTEXT_PARAMS; + +/*! + * NVFBC_RELEASE_CONTEXT_PARAMS structure version. + */ +#define NVFBC_RELEASE_CONTEXT_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_RELEASE_CONTEXT_PARAMS, 1) + +/*! + * Defines flags that can be used when capturing to system memory. + */ +typedef enum +{ + /*! + * Default, capturing waits for a new frame or mouse move. + * + * The default behavior of blocking grabs is to wait for a new frame until + * after the call was made. But it's possible that there is a frame already + * ready that the client hasn't seen. + * \see NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY + */ + NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS = 0, + /*! + * Capturing does not wait for a new frame nor a mouse move. + * + * It is therefore possible to capture the same frame multiple times. + * When this occurs, the dwCurrentFrame parameter of the + * NVFBC_FRAME_GRAB_INFO structure is not incremented. + */ + NVFBC_TOSYS_GRAB_FLAGS_NOWAIT = (1 << 0), + /*! + * Forces the destination buffer to be refreshed even if the frame has not + * changed since previous capture. + * + * By default, if the captured frame is identical to the previous one, NvFBC + * will omit one copy and not update the destination buffer. + * + * Setting that flag will prevent this behavior. This can be useful e.g., + * if the application has modified the buffer in the meantime. + */ + NVFBC_TOSYS_GRAB_FLAGS_FORCE_REFRESH = (1 << 1), + /*! + * Similar to NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS, except that the capture will + * not wait if there is already a frame available that the client has + * never seen yet. + */ + NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2), +} NVFBC_TOSYS_GRAB_FLAGS; + +/*! + * Defines parameters for the ::NvFBCToSysSetUp() API call. + */ +typedef struct _NVFBC_TOSYS_SETUP_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOSYS_SETUP_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Desired buffer format. + */ + NVFBC_BUFFER_FORMAT eBufferFormat; + /*! + * [out] Pointer to a pointer to a buffer in system memory. + * + * This buffer contains the pixel value of the requested format. Refer to + * the description of the buffer formats to understand the memory layout. + * + * The application does not need to allocate memory for this buffer. It + * should not free this buffer either. This buffer is automatically + * re-allocated when needed (e.g., when the resolution changes). + * + * This buffer is allocated by the NvFBC library to the proper size. This + * size is returned in the dwByteSize field of the + * ::NVFBC_FRAME_GRAB_INFO structure. + */ + void **ppBuffer; + /*! + * [in] Whether differential maps should be generated. + */ + NVFBC_BOOL bWithDiffMap; + /*! + * [out] Pointer to a pointer to a buffer in system memory. + * + * This buffer contains the differential map of two frames. It must be read + * as an array of unsigned char. Each unsigned char is either 0 or + * non-zero. 0 means that the pixel value at the given location has not + * changed since the previous captured frame. Non-zero means that the pixel + * value has changed. + * + * The application does not need to allocate memory for this buffer. It + * should not free this buffer either. This buffer is automatically + * re-allocated when needed (e.g., when the resolution changes). + * + * This buffer is allocated by the NvFBC library to the proper size. The + * size of the differential map is returned in ::diffMapSize. + * + * This option is not compatible with the ::NVFBC_BUFFER_FORMAT_YUV420P and + * ::NVFBC_BUFFER_FORMAT_YUV444P buffer formats. + */ + void **ppDiffMap; + /*! + * [in] Scaling factor of the differential maps. + * + * For example, a scaling factor of 16 means that one pixel of the diffmap + * will represent 16x16 pixels of the original frames. + * + * If any of these 16x16 pixels is different between the current and the + * previous frame, then the corresponding pixel in the diffmap will be set + * to non-zero. + * + * The default scaling factor is 1. A dwDiffMapScalingFactor of 0 will be + * set to 1. + */ + uint32_t dwDiffMapScalingFactor; + /*! + * [out] Size of the differential map. + * + * Only set if bWithDiffMap is set to NVFBC_TRUE. + */ + NVFBC_SIZE diffMapSize; +} NVFBC_TOSYS_SETUP_PARAMS; + +/*! + * NVFBC_TOSYS_SETUP_PARAMS structure version. + */ +#define NVFBC_TOSYS_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOSYS_SETUP_PARAMS, 3) + +/*! + * Defines parameters for the ::NvFBCToSysGrabFrame() API call. + */ +typedef struct _NVFBC_TOSYS_GRAB_FRAME_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Flags defining the behavior of this frame capture. + */ + uint32_t dwFlags; + /*! + * [out] Information about the captured frame. + * + * Can be NULL. + */ + NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo; + /*! + * [in] Wait timeout in milliseconds. + * + * When capturing frames with the NVFBC_TOSYS_GRAB_FLAGS_NOFLAGS or + * NVFBC_TOSYS_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags, + * NvFBC will wait for a new frame or mouse move until the below timer + * expires. + * + * When timing out, the last captured frame will be returned. Note that as + * long as the NVFBC_TOSYS_GRAB_FLAGS_FORCE_REFRESH flag is not set, + * returning an old frame will incur no performance penalty. + * + * NvFBC clients can use the return value of the grab frame operation to + * find out whether a new frame was captured, or the timer expired. + * + * Note that the behavior of blocking calls is to wait for a new frame + * *after* the call has been made. When using timeouts, it is possible + * that NvFBC will return a new frame (e.g., it has never been captured + * before) even though no new frame was generated after the grab call. + * + * For the precise definition of what constitutes a new frame, see + * ::bIsNewFrame. + * + * Set to 0 to disable timeouts. + */ + uint32_t dwTimeoutMs; +} NVFBC_TOSYS_GRAB_FRAME_PARAMS; + +/*! + * NVFBC_TOSYS_GRAB_FRAME_PARAMS structure version. + */ +#define NVFBC_TOSYS_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOSYS_GRAB_FRAME_PARAMS, 2) + +/*! + * Defines flags that can be used when capturing to a CUDA buffer in video memory. + */ +typedef enum +{ + /*! + * Default, capturing waits for a new frame or mouse move. + * + * The default behavior of blocking grabs is to wait for a new frame until + * after the call was made. But it's possible that there is a frame already + * ready that the client hasn't seen. + * \see NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY + */ + NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS = 0, + /*! + * Capturing does not wait for a new frame nor a mouse move. + * + * It is therefore possible to capture the same frame multiple times. + * When this occurs, the dwCurrentFrame parameter of the + * NVFBC_FRAME_GRAB_INFO structure is not incremented. + */ + NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT = (1 << 0), + /*! + * [in] Forces the destination buffer to be refreshed even if the frame + * has not changed since previous capture. + * + * By default, if the captured frame is identical to the previous one, NvFBC + * will omit one copy and not update the destination buffer. + * + * Setting that flag will prevent this behavior. This can be useful e.g., + * if the application has modified the buffer in the meantime. + */ + NVFBC_TOCUDA_GRAB_FLAGS_FORCE_REFRESH = (1 << 1), + /*! + * Similar to NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS, except that the capture will + * not wait if there is already a frame available that the client has + * never seen yet. + */ + NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2), +} NVFBC_TOCUDA_FLAGS; + +/*! + * Defines parameters for the ::NvFBCToCudaSetUp() API call. + */ +typedef struct _NVFBC_TOCUDA_SETUP_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOCUDA_SETUP_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Desired buffer format. + */ + NVFBC_BUFFER_FORMAT eBufferFormat; +} NVFBC_TOCUDA_SETUP_PARAMS; + +/*! + * NVFBC_TOCUDA_SETUP_PARAMS structure version. + */ +#define NVFBC_TOCUDA_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOCUDA_SETUP_PARAMS, 1) + +/*! + * Defines parameters for the ::NvFBCToCudaGrabFrame() API call. + */ +typedef struct _NVFBC_TOCUDA_GRAB_FRAME_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER. + */ + uint32_t dwVersion; + /*! + * [in] Flags defining the behavior of this frame capture. + */ + uint32_t dwFlags; + /*! + * [out] Pointer to a ::CUdeviceptr + * + * The application does not need to allocate memory for this CUDA device. + * + * The application does need to create its own CUDA context to use this + * CUDA device. + * + * This ::CUdeviceptr will be mapped to a segment in video memory containing + * the frame. It is not possible to process a CUDA device while capturing + * a new frame. If the application wants to do so, it must copy the CUDA + * device using ::cuMemcpyDtoD or ::cuMemcpyDtoH beforehand. + */ + void *pCUDADeviceBuffer; + /*! + * [out] Information about the captured frame. + * + * Can be NULL. + */ + NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo; + /*! + * [in] Wait timeout in milliseconds. + * + * When capturing frames with the NVFBC_TOCUDA_GRAB_FLAGS_NOFLAGS or + * NVFBC_TOCUDA_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags, + * NvFBC will wait for a new frame or mouse move until the below timer + * expires. + * + * When timing out, the last captured frame will be returned. Note that as + * long as the NVFBC_TOCUDA_GRAB_FLAGS_FORCE_REFRESH flag is not set, + * returning an old frame will incur no performance penalty. + * + * NvFBC clients can use the return value of the grab frame operation to + * find out whether a new frame was captured, or the timer expired. + * + * Note that the behavior of blocking calls is to wait for a new frame + * *after* the call has been made. When using timeouts, it is possible + * that NvFBC will return a new frame (e.g., it has never been captured + * before) even though no new frame was generated after the grab call. + * + * For the precise definition of what constitutes a new frame, see + * ::bIsNewFrame. + * + * Set to 0 to disable timeouts. + */ + uint32_t dwTimeoutMs; +} NVFBC_TOCUDA_GRAB_FRAME_PARAMS; + +/*! + * NVFBC_TOCUDA_GRAB_FRAME_PARAMS structure version. + */ +#define NVFBC_TOCUDA_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOCUDA_GRAB_FRAME_PARAMS, 2) + +/*! + * Defines flags that can be used when capturing to an OpenGL buffer in video memory. + */ +typedef enum +{ + /*! + * Default, capturing waits for a new frame or mouse move. + * + * The default behavior of blocking grabs is to wait for a new frame until + * after the call was made. But it's possible that there is a frame already + * ready that the client hasn't seen. + * \see NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY + */ + NVFBC_TOGL_GRAB_FLAGS_NOFLAGS = 0, + /*! + * Capturing does not wait for a new frame nor a mouse move. + * + * It is therefore possible to capture the same frame multiple times. + * When this occurs, the dwCurrentFrame parameter of the + * NVFBC_FRAME_GRAB_INFO structure is not incremented. + */ + NVFBC_TOGL_GRAB_FLAGS_NOWAIT = (1 << 0), + /*! + * [in] Forces the destination buffer to be refreshed even if the frame + * has not changed since previous capture. + * + * By default, if the captured frame is identical to the previous one, NvFBC + * will omit one copy and not update the destination buffer. + * + * Setting that flag will prevent this behavior. This can be useful e.g., + * if the application has modified the buffer in the meantime. + */ + NVFBC_TOGL_GRAB_FLAGS_FORCE_REFRESH = (1 << 1), + /*! + * Similar to NVFBC_TOGL_GRAB_FLAGS_NOFLAGS, except that the capture will + * not wait if there is already a frame available that the client has + * never seen yet. + */ + NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY = (1 << 2), +} NVFBC_TOGL_FLAGS; + +/*! + * Maximum number of GL textures that can be used to store frames. + */ +#define NVFBC_TOGL_TEXTURES_MAX 2 + +/*! + * Defines parameters for the ::NvFBCToGLSetUp() API call. + */ +typedef struct _NVFBC_TOGL_SETUP_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOGL_SETUP_PARAMS_VER + */ + uint32_t dwVersion; + /*! + * [in] Desired buffer format. + */ + NVFBC_BUFFER_FORMAT eBufferFormat; + /*! + * [in] Whether differential maps should be generated. + */ + NVFBC_BOOL bWithDiffMap; + /*! + * [out] Pointer to a pointer to a buffer in system memory. + * + * \see NVFBC_TOSYS_SETUP_PARAMS::ppDiffMap + */ + void **ppDiffMap; + /*! + * [in] Scaling factor of the differential maps. + * + * \see NVFBC_TOSYS_SETUP_PARAMS::dwDiffMapScalingFactor + */ + uint32_t dwDiffMapScalingFactor; + /*! + * [out] List of GL textures that will store the captured frames. + * + * This array is 0 terminated. The number of textures varies depending on + * the capture settings (such as whether diffmaps are enabled). + * + * An application wishing to interop with, for example, EncodeAPI will need + * to register these textures prior to start encoding frames. + * + * After each frame capture, the texture holding the current frame will be + * returned in NVFBC_TOGL_GRAB_FRAME_PARAMS::dwTexture. + */ + uint32_t dwTextures[NVFBC_TOGL_TEXTURES_MAX]; + /*! + * [out] GL target to which the texture should be bound. + */ + uint32_t dwTexTarget; + /*! + * [out] GL format of the textures. + */ + uint32_t dwTexFormat; + /*! + * [out] GL type of the textures. + */ + uint32_t dwTexType; + /*! + * [out] Size of the differential map. + * + * Only set if bWithDiffMap is set to NVFBC_TRUE. + */ + NVFBC_SIZE diffMapSize; +} NVFBC_TOGL_SETUP_PARAMS; + +/*! + * NVFBC_TOGL_SETUP_PARAMS structure version. + */ +#define NVFBC_TOGL_SETUP_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOGL_SETUP_PARAMS, 2) + +/*! + * Defines parameters for the ::NvFBCToGLGrabFrame() API call. + */ +typedef struct _NVFBC_TOGL_GRAB_FRAME_PARAMS +{ + /*! + * [in] Must be set to NVFBC_TOGL_GRAB_FRAME_PARAMS_VER. + */ + uint32_t dwVersion; + /*! + * [in] Flags defining the behavior of this frame capture. + */ + uint32_t dwFlags; + /*! + * [out] Index of the texture storing the current frame. + * + * This is an index in the NVFBC_TOGL_SETUP_PARAMS::dwTextures array. + */ + uint32_t dwTextureIndex; + /*! + * [out] Information about the captured frame. + * + * Can be NULL. + */ + NVFBC_FRAME_GRAB_INFO *pFrameGrabInfo; + /*! + * [in] Wait timeout in milliseconds. + * + * When capturing frames with the NVFBC_TOGL_GRAB_FLAGS_NOFLAGS or + * NVFBC_TOGL_GRAB_FLAGS_NOWAIT_IF_NEW_FRAME_READY flags, + * NvFBC will wait for a new frame or mouse move until the below timer + * expires. + * + * When timing out, the last captured frame will be returned. Note that as + * long as the NVFBC_TOGL_GRAB_FLAGS_FORCE_REFRESH flag is not set, + * returning an old frame will incur no performance penalty. + * + * NvFBC clients can use the return value of the grab frame operation to + * find out whether a new frame was captured, or the timer expired. + * + * Note that the behavior of blocking calls is to wait for a new frame + * *after* the call has been made. When using timeouts, it is possible + * that NvFBC will return a new frame (e.g., it has never been captured + * before) even though no new frame was generated after the grab call. + * + * For the precise definition of what constitutes a new frame, see + * ::bIsNewFrame. + * + * Set to 0 to disable timeouts. + */ + uint32_t dwTimeoutMs; +} NVFBC_TOGL_GRAB_FRAME_PARAMS; + +/*! + * NVFBC_TOGL_GRAB_FRAME_PARAMS structure version. + */ +#define NVFBC_TOGL_GRAB_FRAME_PARAMS_VER NVFBC_STRUCT_VERSION(NVFBC_TOGL_GRAB_FRAME_PARAMS, 2) + +/*! @} FBC_STRUCT */ + +/*! + * \defgroup FBC_FUNC API Entry Points + * + * Entry points are thread-safe and can be called concurrently. + * + * The locking model includes a global lock that protects session handle + * management (\see NvFBCCreateHandle, \see NvFBCDestroyHandle). + * + * Each NvFBC session uses a local lock to protect other entry points. Note + * that in certain cases, a thread can hold the local lock for an undefined + * amount of time, such as grabbing a frame using a blocking call. + * + * Note that a context is associated with each session. NvFBC clients wishing + * to share a session between different threads are expected to release and + * bind the context appropriately (\see NvFBCBindContext, + * \see NvFBCReleaseContext). This is not required when each thread uses its + * own NvFBC session. + * + * @{ + */ + +/*! + * Gets the last error message that got recorded for a client. + * + * When NvFBC returns an error, it will save an error message that can be + * queried through this API call. Only the last message is saved. + * The message and the return code should give enough information about + * what went wrong. + * + * \param [in] sessionHandle + * Handle to the NvFBC client. + * \return + * A NULL terminated error message, or an empty string. Its maximum length + * is NVFBC_ERROR_STR_LEN. + */ +const char* NVFBCAPI NvFBCGetLastErrorStr(const NVFBC_SESSION_HANDLE sessionHandle); + +/*! + * \brief Allocates a new handle for an NvFBC client. + * + * This function allocates a session handle used to identify an FBC client. + * + * This function implicitly calls NvFBCBindContext(). + * + * \param [out] pSessionHandle + * Pointer that will hold the allocated session handle. + * \param [in] pParams + * ::NVFBC_CREATE_HANDLE_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_OUT_OF_MEMORY \n + * ::NVFBC_ERR_MAX_CLIENTS \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_GLX \n + * ::NVFBC_ERR_GL + * + */ +NVFBCSTATUS NVFBCAPI NvFBCCreateHandle(NVFBC_SESSION_HANDLE *pSessionHandle, NVFBC_CREATE_HANDLE_PARAMS *pParams); + +/*! + * \brief Destroys the handle of an NvFBC client. + * + * This function uninitializes an FBC client. + * + * This function implicitly calls NvFBCReleaseContext(). + * + * After this fucntion returns, it is not possible to use this session handle + * for any further API call. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_DESTROY_HANDLE_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCDestroyHandle(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_HANDLE_PARAMS *pParams); + +/*! + * \brief Gets the current status of the display driver. + * + * This function queries the display driver for various information. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_GET_STATUS_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCGetStatus(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_GET_STATUS_PARAMS *pParams); + +/*! + * \brief Binds the FBC context to the calling thread. + * + * The NvFBC library internally relies on objects that must be bound to a + * thread. Such objects are OpenGL contexts and CUDA contexts. + * + * This function binds these objects to the calling thread. + * + * The FBC context must be bound to the calling thread for most NvFBC entry + * points, otherwise ::NVFBC_ERR_CONTEXT is returned. + * + * If the FBC context is already bound to a different thread, + * ::NVFBC_ERR_CONTEXT is returned. The other thread must release the context + * first by calling the ReleaseContext() entry point. + * + * If the FBC context is already bound to the current thread, this function has + * no effects. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_DESTROY_CAPTURE_SESSION_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCBindContext(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_BIND_CONTEXT_PARAMS *pParams); + +/*! + * \brief Releases the FBC context from the calling thread. + * + * If the FBC context is bound to a different thread, ::NVFBC_ERR_CONTEXT is + * returned. + * + * If the FBC context is already released, this functino has no effects. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCReleaseContext(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_RELEASE_CONTEXT_PARAMS *pParams); + +/*! + * \brief Creates a capture session for an FBC client. + * + * This function starts a capture session of the desired type (system memory, + * video memory with CUDA interop, or H.264 compressed frames in system memory). + * + * Not all types are supported on all systems. Also, it is possible to use + * NvFBC without having the CUDA library. In this case, requesting a capture + * session of the concerned type will return an error. + * + * After this function returns, the display driver will start generating frames + * that can be captured using the corresponding API call. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_CREATE_CAPTURE_SESSION_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INVALID_PARAM \n + * ::NVFBC_ERR_OUT_OF_MEMORY \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_GLX \n + * ::NVFBC_ERR_GL \n + * ::NVFBC_ERR_CUDA \n + * ::NVFBC_ERR_MUST_RECREATE \n + * ::NVFBC_ERR_INTERNAL + */ +NVFBCSTATUS NVFBCAPI NvFBCCreateCaptureSession(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_CREATE_CAPTURE_SESSION_PARAMS *pParams); + +/*! + * \brief Destroys a capture session for an FBC client. + * + * This function stops a capture session and frees allocated objects. + * + * After this function returns, it is possible to create another capture + * session using the corresponding API call. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_DESTROY_CAPTURE_SESSION_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCDestroyCaptureSession(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_CAPTURE_SESSION_PARAMS *pParams); + +/*! + * \brief Sets up a capture to system memory session. + * + * This function configures how the capture to system memory should behave. It + * can be called anytime and several times after the capture session has been + * created. However, it must be called at least once prior to start capturing + * frames. + * + * This function allocates the buffer that will contain the captured frame. + * The application does not need to free this buffer. The size of this buffer + * is returned in the ::NVFBC_FRAME_GRAB_INFO structure. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_TOSYS_SETUP_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_UNSUPPORTED \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_INVALID_PARAM \n + * ::NVFBC_ERR_OUT_OF_MEMORY \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCToSysSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_SETUP_PARAMS *pParams); + +/*! + * \brief Captures a frame to a buffer in system memory. + * + * This function triggers a frame capture to a buffer in system memory that was + * registered with the ToSysSetUp() API call. + * + * Note that it is possible that the resolution of the desktop changes while + * capturing frames. This should be transparent for the application. + * + * When the resolution changes, the capture session is recreated using the same + * parameters, and necessary buffers are re-allocated. The frame counter is not + * reset. + * + * An application can detect that the resolution changed by comparing the + * dwByteSize member of the ::NVFBC_FRAME_GRAB_INFO against a previous + * frame and/or dwWidth and dwHeight. + * + * During a change of resolution the capture is paused even in asynchronous + * mode. + * + * \param [in] sessionHandle + * FBC session handle. + * \param [in] pParams + * ::NVFBC_TOSYS_GRAB_FRAME_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_MUST_RECREATE \n + * \see NvFBCCreateCaptureSession \n + * \see NvFBCToSysSetUp + */ +NVFBCSTATUS NVFBCAPI NvFBCToSysGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_GRAB_FRAME_PARAMS *pParams); + +/*! + * \brief Sets up a capture to video memory session. + * + * This function configures how the capture to video memory with CUDA interop + * should behave. It can be called anytime and several times after the capture + * session has been created. However, it must be called at least once prior + * to start capturing frames. + * + * \param [in] sessionHandle + * FBC session handle. + * + * \param [in] pParams + * ::NVFBC_TOCUDA_SETUP_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_UNSUPPORTED \n + * ::NVFBC_ERR_GL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCToCudaSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_SETUP_PARAMS *pParams); + +/*! + * \brief Captures a frame to a CUDA device in video memory. + * + * This function triggers a frame capture to a CUDA device in video memory. + * + * Note about changes of resolution: \see NvFBCToSysGrabFrame + * + * \param [in] sessionHandle + * FBC session handle. + * + * \param [in] pParams + * ::NVFBC_TOCUDA_GRAB_FRAME_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_CUDA \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_MUST_RECREATE \n + * \see NvFBCCreateCaptureSession \n + * \see NvFBCToCudaSetUp + */ +NVFBCSTATUS NVFBCAPI NvFBCToCudaGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_GRAB_FRAME_PARAMS *pParams); + +/*! + * \brief Sets up a capture to OpenGL buffer in video memory session. + * + * This function configures how the capture to video memory should behave. + * It can be called anytime and several times after the capture session has been + * created. However, it must be called at least once prior to start capturing + * frames. + * + * \param [in] sessionHandle + * FBC session handle. + * + * \param [in] pParams + * ::NVFBC_TOGL_SETUP_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_UNSUPPORTED \n + * ::NVFBC_ERR_GL \n + * ::NVFBC_ERR_X + */ +NVFBCSTATUS NVFBCAPI NvFBCToGLSetUp(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_SETUP_PARAMS *pParams); + +/*! + * \brief Captures a frame to an OpenGL buffer in video memory. + * + * This function triggers a frame capture to a selected resource in video memory. + * + * Note about changes of resolution: \see NvFBCToSysGrabFrame + * + * \param [in] sessionHandle + * FBC session handle. + * + * \param [in] pParams + * ::NVFBC_TOGL_GRAB_FRAME_PARAMS + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_HANDLE \n + * ::NVFBC_ERR_API_VERSION \n + * ::NVFBC_ERR_BAD_REQUEST \n + * ::NVFBC_ERR_CONTEXT \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_INTERNAL \n + * ::NVFBC_ERR_X \n + * ::NVFBC_ERR_MUST_RECREATE \n + * \see NvFBCCreateCaptureSession \n + * \see NvFBCToCudaSetUp + */ +NVFBCSTATUS NVFBCAPI NvFBCToGLGrabFrame(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_GRAB_FRAME_PARAMS *pParams); + +/*! + * \cond FBC_PFN + * + * Defines API function pointers + */ +typedef const char* (NVFBCAPI* PNVFBCGETLASTERRORSTR)(const NVFBC_SESSION_HANDLE sessionHandle); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATEHANDLE)(NVFBC_SESSION_HANDLE *pSessionHandle, NVFBC_CREATE_HANDLE_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCDESTROYHANDLE)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_HANDLE_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCBINDCONTEXT)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_BIND_CONTEXT_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCRELEASECONTEXT)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_RELEASE_CONTEXT_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCGETSTATUS)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_GET_STATUS_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATECAPTURESESSION)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_CREATE_CAPTURE_SESSION_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCDESTROYCAPTURESESSION)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_DESTROY_CAPTURE_SESSION_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOSYSSETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_SETUP_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOSYSGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOSYS_GRAB_FRAME_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOCUDASETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_SETUP_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOCUDAGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOCUDA_GRAB_FRAME_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOGLSETUP)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_SETUP_PARAMS *pParams); +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCTOGLGRABFRAME)(const NVFBC_SESSION_HANDLE sessionHandle, NVFBC_TOGL_GRAB_FRAME_PARAMS *pParams); + +/// \endcond + +/*! @} FBC_FUNC */ + +/*! + * \ingroup FBC_STRUCT + * + * Structure populated with API function pointers. + */ +typedef struct +{ + uint32_t dwVersion; //!< [in] Must be set to NVFBC_VERSION. + PNVFBCGETLASTERRORSTR nvFBCGetLastErrorStr; //!< [out] Pointer to ::NvFBCGetLastErrorStr(). + PNVFBCCREATEHANDLE nvFBCCreateHandle; //!< [out] Pointer to ::NvFBCCreateHandle(). + PNVFBCDESTROYHANDLE nvFBCDestroyHandle; //!< [out] Pointer to ::NvFBCDestroyHandle(). + PNVFBCGETSTATUS nvFBCGetStatus; //!< [out] Pointer to ::NvFBCGetStatus(). + PNVFBCCREATECAPTURESESSION nvFBCCreateCaptureSession; //!< [out] Pointer to ::NvFBCCreateCaptureSession(). + PNVFBCDESTROYCAPTURESESSION nvFBCDestroyCaptureSession; //!< [out] Pointer to ::NvFBCDestroyCaptureSession(). + PNVFBCTOSYSSETUP nvFBCToSysSetUp; //!< [out] Pointer to ::NvFBCToSysSetUp(). + PNVFBCTOSYSGRABFRAME nvFBCToSysGrabFrame; //!< [out] Pointer to ::NvFBCToSysGrabFrame(). + PNVFBCTOCUDASETUP nvFBCToCudaSetUp; //!< [out] Pointer to ::NvFBCToCudaSetUp(). + PNVFBCTOCUDAGRABFRAME nvFBCToCudaGrabFrame; //!< [out] Pointer to ::NvFBCToCudaGrabFrame(). + void* pad1; //!< [out] Retired. Do not use. + void* pad2; //!< [out] Retired. Do not use. + void* pad3; //!< [out] Retired. Do not use. + PNVFBCBINDCONTEXT nvFBCBindContext; //!< [out] Pointer to ::NvFBCBindContext(). + PNVFBCRELEASECONTEXT nvFBCReleaseContext; //!< [out] Pointer to ::NvFBCReleaseContext(). + void* pad4; //!< [out] Retired. Do not use. + void* pad5; //!< [out] Retired. Do not use. + void* pad6; //!< [out] Retired. Do not use. + void* pad7; //!< [out] Retired. Do not use. + PNVFBCTOGLSETUP nvFBCToGLSetUp; //!< [out] Pointer to ::nvFBCToGLSetup(). + PNVFBCTOGLGRABFRAME nvFBCToGLGrabFrame; //!< [out] Pointer to ::nvFBCToGLGrabFrame(). +} NVFBC_API_FUNCTION_LIST; + +/*! + * \ingroup FBC_FUNC + * + * \brief Entry Points to the NvFBC interface. + * + * Creates an instance of the NvFBC interface, and populates the + * pFunctionList with function pointers to the API routines implemented by + * the NvFBC interface. + * + * \param [out] pFunctionList + * + * \return + * ::NVFBC_SUCCESS \n + * ::NVFBC_ERR_INVALID_PTR \n + * ::NVFBC_ERR_API_VERSION + */ +NVFBCSTATUS NVFBCAPI NvFBCCreateInstance(NVFBC_API_FUNCTION_LIST *pFunctionList); +/*! + * \ingroup FBC_FUNC + * + * Defines function pointer for the ::NvFBCCreateInstance() API call. + */ +typedef NVFBCSTATUS (NVFBCAPI* PNVFBCCREATEINSTANCE)(NVFBC_API_FUNCTION_LIST *pFunctionList); + +#ifdef __cplusplus +} +#endif + +#endif // _NVFBC_H_ diff --git a/third-party/nvfbc/helper_math.h b/third-party/nvfbc/helper_math.h new file mode 100644 index 00000000..d17b024e --- /dev/null +++ b/third-party/nvfbc/helper_math.h @@ -0,0 +1,1469 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file implements common mathematical operations on vector types + * (float3, float4 etc.) since these are not provided as standard by CUDA. + * + * The syntax is modeled on the Cg standard library. + * + * This is part of the Helper library includes + * + * Thanks to Linh Hah for additions and fixes. + */ + +#ifndef HELPER_MATH_H +#define HELPER_MATH_H + +#include "cuda_runtime.h" + +typedef unsigned int uint; +typedef unsigned short ushort; + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#ifndef __CUDACC__ +#include + +//////////////////////////////////////////////////////////////////////////////// +// host implementations of CUDA functions +//////////////////////////////////////////////////////////////////////////////// + +inline float fminf(float a, float b) +{ + return a < b ? a : b; +} + +inline float fmaxf(float a, float b) +{ + return a > b ? a : b; +} + +inline int max(int a, int b) +{ + return a > b ? a : b; +} + +inline int min(int a, int b) +{ + return a < b ? a : b; +} + +inline float rsqrtf(float x) +{ + return 1.0f / sqrtf(x); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// constructors +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 make_float2(float s) +{ + return make_float2(s, s); +} +inline __host__ __device__ float2 make_float2(float3 a) +{ + return make_float2(a.x, a.y); +} +inline __host__ __device__ float2 make_float2(int2 a) +{ + return make_float2(float(a.x), float(a.y)); +} +inline __host__ __device__ float2 make_float2(uint2 a) +{ + return make_float2(float(a.x), float(a.y)); +} + +inline __host__ __device__ int2 make_int2(int s) +{ + return make_int2(s, s); +} +inline __host__ __device__ int2 make_int2(int3 a) +{ + return make_int2(a.x, a.y); +} +inline __host__ __device__ int2 make_int2(uint2 a) +{ + return make_int2(int(a.x), int(a.y)); +} +inline __host__ __device__ int2 make_int2(float2 a) +{ + return make_int2(int(a.x), int(a.y)); +} + +inline __host__ __device__ uint2 make_uint2(uint s) +{ + return make_uint2(s, s); +} +inline __host__ __device__ uint2 make_uint2(uint3 a) +{ + return make_uint2(a.x, a.y); +} +inline __host__ __device__ uint2 make_uint2(int2 a) +{ + return make_uint2(uint(a.x), uint(a.y)); +} + +inline __host__ __device__ float3 make_float3(float s) +{ + return make_float3(s, s, s); +} +inline __host__ __device__ float3 make_float3(float2 a) +{ + return make_float3(a.x, a.y, 0.0f); +} +inline __host__ __device__ float3 make_float3(float2 a, float s) +{ + return make_float3(a.x, a.y, s); +} +inline __host__ __device__ float3 make_float3(float4 a) +{ + return make_float3(a.x, a.y, a.z); +} +inline __host__ __device__ float3 make_float3(int3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} +inline __host__ __device__ float3 make_float3(uint3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} + +inline __host__ __device__ int3 make_int3(int s) +{ + return make_int3(s, s, s); +} +inline __host__ __device__ int3 make_int3(int2 a) +{ + return make_int3(a.x, a.y, 0); +} +inline __host__ __device__ int3 make_int3(int2 a, int s) +{ + return make_int3(a.x, a.y, s); +} +inline __host__ __device__ int3 make_int3(uint3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} +inline __host__ __device__ int3 make_int3(float3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} + +inline __host__ __device__ uint3 make_uint3(uint s) +{ + return make_uint3(s, s, s); +} +inline __host__ __device__ uint3 make_uint3(uint2 a) +{ + return make_uint3(a.x, a.y, 0); +} +inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) +{ + return make_uint3(a.x, a.y, s); +} +inline __host__ __device__ uint3 make_uint3(uint4 a) +{ + return make_uint3(a.x, a.y, a.z); +} +inline __host__ __device__ uint3 make_uint3(int3 a) +{ + return make_uint3(uint(a.x), uint(a.y), uint(a.z)); +} + +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 make_float4(float3 a, float w) +{ + return make_float4(a.x, a.y, a.z, w); +} +inline __host__ __device__ float4 make_float4(int4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} +inline __host__ __device__ float4 make_float4(uint4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} + +inline __host__ __device__ int4 make_int4(int s) +{ + return make_int4(s, s, s, s); +} +inline __host__ __device__ int4 make_int4(int3 a) +{ + return make_int4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ int4 make_int4(int3 a, int w) +{ + return make_int4(a.x, a.y, a.z, w); +} +inline __host__ __device__ int4 make_int4(uint4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} +inline __host__ __device__ int4 make_int4(float4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} + + +inline __host__ __device__ uint4 make_uint4(uint s) +{ + return make_uint4(s, s, s, s); +} +inline __host__ __device__ uint4 make_uint4(uint3 a) +{ + return make_uint4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) +{ + return make_uint4(a.x, a.y, a.z, w); +} +inline __host__ __device__ uint4 make_uint4(int4 a) +{ + return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// negate +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 &a) +{ + return make_float2(-a.x, -a.y); +} +inline __host__ __device__ int2 operator-(int2 &a) +{ + return make_int2(-a.x, -a.y); +} +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ float4 operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +inline __host__ __device__ int4 operator-(int4 &a) +{ + return make_int4(-a.x, -a.y, -a.z, -a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// addition +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator+(float2 a, float2 b) +{ + return make_float2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ float2 operator+(float2 a, float b) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ float2 operator+(float b, float2 a) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(float2 &a, float b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ int2 operator+(int2 a, int2 b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(int2 &a, int2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ int2 operator+(int2 a, int b) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ int2 operator+(int b, int2 a) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(int2 &a, int b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) +{ + return make_uint2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(uint2 &a, uint2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ uint2 operator+(uint2 a, uint b) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ uint2 operator+(uint b, uint2 a) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(uint2 &a, uint b) +{ + a.x += b; + a.y += b; +} + + +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ float3 operator+(float3 a, float b) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(float3 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ int3 operator+(int3 a, int b) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(int3 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ uint3 operator+(uint3 a, uint b) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(uint3 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int b, int3 a) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ uint3 operator+(uint b, uint3 a) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ float3 operator+(float b, float3 a) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} + +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ float4 operator+(float4 a, float b) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ float4 operator+(float b, float4 a) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(float4 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ int4 operator+(int4 a, int4 b) +{ + return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(int4 &a, int4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ int4 operator+(int4 a, int b) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ int4 operator+(int b, int4 a) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(int4 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(uint4 &a, uint4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ uint4 operator+(uint4 a, uint b) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ uint4 operator+(uint b, uint4 a) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(uint4 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +//////////////////////////////////////////////////////////////////////////////// +// subtract +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 a, float2 b) +{ + return make_float2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(float2 &a, float2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ float2 operator-(float2 a, float b) +{ + return make_float2(a.x - b, a.y - b); +} +inline __host__ __device__ float2 operator-(float b, float2 a) +{ + return make_float2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(float2 &a, float b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ int2 operator-(int2 a, int2 b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(int2 &a, int2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ int2 operator-(int2 a, int b) +{ + return make_int2(a.x - b, a.y - b); +} +inline __host__ __device__ int2 operator-(int b, int2 a) +{ + return make_int2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(int2 &a, int b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) +{ + return make_uint2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ uint2 operator-(uint2 a, uint b) +{ + return make_uint2(a.x - b, a.y - b); +} +inline __host__ __device__ uint2 operator-(uint b, uint2 a) +{ + return make_uint2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ float3 operator-(float3 a, float b) +{ + return make_float3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ float3 operator-(float b, float3 a) +{ + return make_float3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(float3 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ int3 operator-(int3 a, int b) +{ + return make_int3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ int3 operator-(int b, int3 a) +{ + return make_int3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(int3 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ uint3 operator-(uint3 a, uint b) +{ + return make_uint3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ uint3 operator-(uint b, uint3 a) +{ + return make_uint3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ float4 operator-(float4 a, float b) +{ + return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ void operator-=(float4 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ int4 operator-(int4 a, int4 b) +{ + return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(int4 &a, int4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ int4 operator-(int4 a, int b) +{ + return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ int4 operator-(int b, int4 a) +{ + return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(int4 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) +{ + return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ uint4 operator-(uint4 a, uint b) +{ + return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ uint4 operator-(uint b, uint4 a) +{ + return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator*(float2 a, float2 b) +{ + return make_float2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(float2 &a, float2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ float2 operator*(float2 a, float b) +{ + return make_float2(a.x * b, a.y * b); +} +inline __host__ __device__ float2 operator*(float b, float2 a) +{ + return make_float2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(float2 &a, float b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ int2 operator*(int2 a, int2 b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(int2 &a, int2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ int2 operator*(int2 a, int b) +{ + return make_int2(a.x * b, a.y * b); +} +inline __host__ __device__ int2 operator*(int b, int2 a) +{ + return make_int2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(int2 &a, int b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) +{ + return make_uint2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ uint2 operator*(uint2 a, uint b) +{ + return make_uint2(a.x * b, a.y * b); +} +inline __host__ __device__ uint2 operator*(uint b, uint2 a) +{ + return make_uint2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ float3 operator*(float3 a, float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(float3 &a, float3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ float3 operator*(float3 a, float b) +{ + return make_float3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ float3 operator*(float b, float3 a) +{ + return make_float3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(float3 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(int3 &a, int3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ int3 operator*(int3 a, int b) +{ + return make_int3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ int3 operator*(int b, int3 a) +{ + return make_int3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(int3 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ uint3 operator*(uint3 a, uint b) +{ + return make_uint3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ uint3 operator*(uint b, uint3 a) +{ + return make_uint3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ float4 operator*(float4 a, float4 b) +{ + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(float4 &a, float4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ float4 operator*(float4 a, float b) +{ + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ float4 operator*(float b, float4 a) +{ + return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(float4 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ int4 operator*(int4 a, int4 b) +{ + return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(int4 &a, int4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ int4 operator*(int4 a, int b) +{ + return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ int4 operator*(int b, int4 a) +{ + return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(int4 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) +{ + return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ uint4 operator*(uint4 a, uint b) +{ + return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ uint4 operator*(uint b, uint4 a) +{ + return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// divide +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator/(float2 a, float2 b) +{ + return make_float2(a.x / b.x, a.y / b.y); +} +inline __host__ __device__ void operator/=(float2 &a, float2 b) +{ + a.x /= b.x; + a.y /= b.y; +} +inline __host__ __device__ float2 operator/(float2 a, float b) +{ + return make_float2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(float2 &a, float b) +{ + a.x /= b; + a.y /= b; +} +inline __host__ __device__ float2 operator/(float b, float2 a) +{ + return make_float2(b / a.x, b / a.y); +} + +inline __host__ __device__ float3 operator/(float3 a, float3 b) +{ + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ void operator/=(float3 &a, float3 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; +} +inline __host__ __device__ float3 operator/(float3 a, float b) +{ + return make_float3(a.x / b, a.y / b, a.z / b); +} +inline __host__ __device__ void operator/=(float3 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; +} +inline __host__ __device__ float3 operator/(float b, float3 a) +{ + return make_float3(b / a.x, b / a.y, b / a.z); +} + +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ void operator/=(float4 &a, float4 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; + a.w /= b.w; +} +inline __host__ __device__ float4 operator/(float4 a, float b) +{ + return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); +} +inline __host__ __device__ void operator/=(float4 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; + a.w /= b; +} +inline __host__ __device__ float4 operator/(float b, float4 a) +{ + return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// min +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fminf(float2 a, float2 b) +{ + return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); +} +inline __host__ __device__ float3 fminf(float3 a, float3 b) +{ + return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); +} +inline __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +inline __host__ __device__ int2 min(int2 a, int2 b) +{ + return make_int2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ int4 min(int4 a, int4 b) +{ + return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +inline __host__ __device__ uint2 min(uint2 a, uint2 b) +{ + return make_uint2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ uint4 min(uint4 a, uint4 b) +{ + return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// max +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmaxf(float2 a, float2 b) +{ + return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); +} +inline __host__ __device__ float3 fmaxf(float3 a, float3 b) +{ + return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); +} +inline __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); +} + +inline __host__ __device__ int2 max(int2 a, int2 b) +{ + return make_int2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ int3 max(int3 a, int3 b) +{ + return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ int4 max(int4 a, int4 b) +{ + return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +inline __host__ __device__ uint2 max(uint2 a, uint2 b) +{ + return make_uint2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ uint3 max(uint3 a, uint3 b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ uint4 max(uint4 a, uint4 b) +{ + return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// lerp +// - linear interpolation between a and b, based on value t in [0, 1] range +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) +{ + return a + t*(b-a); +} + +//////////////////////////////////////////////////////////////////////////////// +// clamp +// - clamp the value v to be in the range [a, b] +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return fmaxf(a, fminf(f, b)); +} +inline __device__ __host__ int clamp(int f, int a, int b) +{ + return max(a, min(f, b)); +} +inline __device__ __host__ uint clamp(uint f, uint a, uint b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ float2 clamp(float2 v, float a, float b) +{ + return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) +{ + return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ float4 clamp(float4 v, float a, float b) +{ + return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) +{ + return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ int2 clamp(int2 v, int a, int b) +{ + return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) +{ + return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ int3 clamp(int3 v, int a, int b) +{ + return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) +{ + return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ int4 clamp(int4 v, int a, int b) +{ + return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) +{ + return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) +{ + return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) +{ + return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) +{ + return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) +{ + return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) +{ + return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) +{ + return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// dot product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float dot(float2 a, float2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ int dot(int2 a, int2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ int dot(int3 a, int3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ int dot(int4 a, int4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ uint dot(uint2 a, uint2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ uint dot(uint3 a, uint3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ uint dot(uint4 a, uint4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +//////////////////////////////////////////////////////////////////////////////// +// length +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float length(float2 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float3 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float4 v) +{ + return sqrtf(dot(v, v)); +} + +//////////////////////////////////////////////////////////////////////////////// +// normalize +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 normalize(float2 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float3 normalize(float3 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float4 normalize(float4 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} + +//////////////////////////////////////////////////////////////////////////////// +// floor +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 floorf(float2 v) +{ + return make_float2(floorf(v.x), floorf(v.y)); +} +inline __host__ __device__ float3 floorf(float3 v) +{ + return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); +} +inline __host__ __device__ float4 floorf(float4 v) +{ + return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// frac - returns the fractional portion of a scalar or each vector component +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float fracf(float v) +{ + return v - floorf(v); +} +inline __host__ __device__ float2 fracf(float2 v) +{ + return make_float2(fracf(v.x), fracf(v.y)); +} +inline __host__ __device__ float3 fracf(float3 v) +{ + return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); +} +inline __host__ __device__ float4 fracf(float4 v) +{ + return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// fmod +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmodf(float2 a, float2 b) +{ + return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); +} +inline __host__ __device__ float3 fmodf(float3 a, float3 b) +{ + return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); +} +inline __host__ __device__ float4 fmodf(float4 a, float4 b) +{ + return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// absolute value +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fabs(float2 v) +{ + return make_float2(fabs(v.x), fabs(v.y)); +} +inline __host__ __device__ float3 fabs(float3 v) +{ + return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); +} +inline __host__ __device__ float4 fabs(float4 v) +{ + return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); +} + +inline __host__ __device__ int2 abs(int2 v) +{ + return make_int2(abs(v.x), abs(v.y)); +} +inline __host__ __device__ int3 abs(int3 v) +{ + return make_int3(abs(v.x), abs(v.y), abs(v.z)); +} +inline __host__ __device__ int4 abs(int4 v) +{ + return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// reflect +// - returns reflection of incident ray I around surface normal N +// - N should be normalized, reflected vector's length is equal to length of I +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 reflect(float3 i, float3 n) +{ + return i - 2.0f * n * dot(n,i); +} + +//////////////////////////////////////////////////////////////////////////////// +// cross product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 cross(float3 a, float3 b) +{ + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); +} + +//////////////////////////////////////////////////////////////////////////////// +// smoothstep +// - returns 0 if x < a +// - returns 1 if x > b +// - otherwise returns smooth interpolation between 0 and 1 based on x +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float smoothstep(float a, float b, float x) +{ + float y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(3.0f - (2.0f*y))); +} +inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) +{ + float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y))); +} +inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) +{ + float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y))); +} +inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) +{ + float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y))); +} + +#endif