bool CUDADevice::have_precompiled_kernels()
{
  string cubins_path = path_get("lib");
  return path_exists(cubins_path);
}
void CUDADevice::set_error(const string &error)
{
  Device::set_error(error);

  fprintf(stderr,
          "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
  fprintf(stderr,
          "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
}
CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, const bool headless)
    : GPUDevice(info, stats, profiler, headless)
{
  /* Verify that the device memory handle types match the driver API types. */
  static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
  static_assert(sizeof(arrayMemObject) == sizeof(CUarray));

  need_texture_info = false;

  /* Initialize the CUDA driver API. */
  CUresult result = cuInit(0);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
    return;
  }

  /* Get the device handle for our ordinal. */
  result = cuDeviceGet(&cuDevice, cuDevId);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
                            cuewErrorString(result)));
    return;
  }

  /* Query device capabilities. */
  int value;
  cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
  can_map_host = value != 0;

  cuda_assert(cuDeviceGetAttribute(
      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));

  /* Configure the primary context so local memory grows as needed. */
  unsigned int ctx_flags = 0;
  int active = 0;
  cuda_assert(cuDevicePrimaryCtxGetState(cuDevice, &ctx_flags, &active));

  ctx_flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
  result = cuDevicePrimaryCtxSetFlags(cuDevice, ctx_flags);
  if (result != CUDA_SUCCESS && result != CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) {
    set_error(string_printf("Failed to configure CUDA context (%s)", cuewErrorString(result)));
    return;
  }

  result = cuDevicePrimaryCtxRetain(&cuContext, cuDevice);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
    return;
  }

  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
  /* For example, compute capability 8.6 yields cuDevArchitecture = 860. */
  cuDevArchitecture = major * 100 + minor * 10;
}
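/* Tear down in reverse: unload the kernel module, then release the primary
 * context. */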
CUDADevice::~CUDADevice()
{
  if (cuModule) {
    cuda_assert(cuModuleUnload(cuModule));
  }
  cuda_assert(cuDevicePrimaryCtxRelease(cuDevice));
}
bool CUDADevice::support_device(const uint /*kernel_features*/)
{
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* We only support compute capability 3.0 and up. */
  if (major < 3) {
    set_error(string_printf(
        "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
    return false;
  }

  return true;
}
bool CUDADevice::check_peer_access(Device *peer_device)
{
  if (peer_device == this) {
    return false;
  }

  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);

  int can_access = 0;
  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }

  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
                                      CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
                                      cuDevice,
                                      peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }

  /* Enable peer access in both directions. */
  {
    const CUDAContextScope scope(this);
    CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }
  {
    const CUDAContextScope scope(peer_device_cuda);
    CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }

  return true;
}
bool CUDADevice::use_adaptive_compilation()
{
  return DebugFlags().cuda.adaptive_compile;
}
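/* Adaptive compilation rebuilds kernels with only the features a scene needs,
 * enabled through the debug flags, instead of using the precompiled binaries. */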
string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
{
  const string source_path = path_get("source");
  const string include_path = source_path;

  string cflags = string_printf(
      "--ptxas-options=\"-v\" "
      "-I\"%s\"",
      include_path.c_str());

  if (use_adaptive_compilation()) {
    cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
  }

  const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
  if (extra_cflags) {
    cflags += string(" ") + string(extra_cflags);
  }

# ifdef WITH_NANOVDB
  cflags += " -DWITH_NANOVDB";
# endif

# ifdef WITH_CYCLES_DEBUG
  cflags += " -DWITH_CYCLES_DEBUG";
# endif

  return cflags;
}
string CUDADevice::compile_kernel(const string &common_cflags,
                                  const char *name,
                                  const char *base,
                                  const bool force_ptx)
{
  /* Compute kernel name. */
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* Attempt to use a precompiled kernel shipped with the build. */
  if (!use_adaptive_compilation()) {
    const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin.zst", name, major, minor));
    VLOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
    if (path_exists(cubin)) {
      VLOG_INFO << "Using precompiled kernel.";
      return cubin;
    }

    /* The driver can JIT-compile PTX built for older generations, so find the closest match. */
    int ptx_major = major, ptx_minor = minor;
    while (ptx_major >= 3) {
      const string ptx = path_get(
          string_printf("lib/%s_compute_%d%d.ptx.zst", name, ptx_major, ptx_minor));
      VLOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
      if (path_exists(ptx)) {
        VLOG_INFO << "Using precompiled kernel.";
        return ptx;
      }

      if (ptx_minor > 0) {
        ptx_minor--;
      }
      else {
        ptx_major--;
        ptx_minor = 9;
      }
    }
  }

  /* Try to use a locally compiled kernel. */
  string source_path = path_get("source");
  const string source_md5 = path_files_md5_hash(source_path);

  /* Include cflags into the md5 so that changing the CUDA toolkit or other
   * compiler arguments forces a rebuild. */
  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);

  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
  const char *const kernel_arch = force_ptx ? "compute" : "sm";

  const string cubin_file = string_printf(
      "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
  const string cubin = path_cache_get(path_join("kernels", cubin_file));
  VLOG_INFO << "Testing for locally compiled kernel " << cubin << ".";
  if (path_exists(cubin)) {
    VLOG_INFO << "Using locally compiled kernel.";
    return cubin;
  }

  /* Precompiled kernels are expected when they ship with the build; report
   * why none matched before attempting a local compile. */
  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
    if (major < 3) {
      set_error(
          string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
                        "Your GPU is not supported.",
                        major,
                        minor));
    }
    else {
      set_error(
          string_printf("CUDA binary kernel for this graphics card compute "
                        "capability (%d.%d) not found.",
                        major,
                        minor));
    }
    return string();
  }

  /* Compile. */
  const char *const nvcc = cuewCompilerPath();
  if (nvcc == nullptr) {
    set_error(
        "CUDA nvcc compiler not found. "
        "Install CUDA toolkit in default location.");
    return string();
  }

  const int nvcc_cuda_version = cuewCompilerVersion();
  VLOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
  if (nvcc_cuda_version < 101) {
    printf(
        "Unsupported CUDA version %d.%d detected, "
        "you need CUDA 10.1 or newer.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);
    return string();
  }
  if (!(nvcc_cuda_version >= 102 && nvcc_cuda_version < 130)) {
    printf(
        "CUDA version %d.%d detected, build may succeed but only "
        "CUDA 10.1 to 12 are officially supported.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);
  }

  const double starttime = time_dt();

  path_create_directories(cubin);

  source_path = path_join(path_join(source_path, "kernel"),
                          path_join("device", path_join(base, string_printf("%s.cu", name))));

  string command = string_printf(
      "\"%s\" -arch=%s_%d%d --%s \"%s\" -o \"%s\" %s",
      nvcc,
      kernel_arch,
      major,
      minor,
      kernel_ext,
      source_path.c_str(),
      cubin.c_str(),
      common_cflags.c_str());

  printf("Compiling %sCUDA kernel ...\n%s\n",
         (use_adaptive_compilation()) ? "adaptive " : "",
         command.c_str());

# ifdef _WIN32
  command = "call " + command;
# endif
  if (system(command.c_str()) != 0) {
    set_error(
        "Failed to execute compilation command, "
        "see console for details.");
    return string();
  }

  /* Verify if compilation succeeded. */
  if (!path_exists(cubin)) {
    set_error(
        "CUDA kernel compilation failed, "
        "see console for details.");
    return string();
  }

  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

  return cubin;
}
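/* Load the compiled module and reserve local memory for kernel launches. */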
bool CUDADevice::load_kernels(const uint kernel_features)
{
  /* TODO: Support kernel reload for adaptive compilation; currently reloading
   * kernels would invalidate memory pointers. */
  if (cuModule) {
    if (use_adaptive_compilation()) {
      VLOG_INFO << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
    }
    return true;
  }

  /* Check if CUDA init succeeded. */
  if (cuContext == nullptr) {
    return false;
  }

  /* Check if the GPU is supported. */
  if (!support_device(kernel_features)) {
    return false;
  }

  /* Get kernel. */
  const char *kernel_name = "kernel";
  string cflags = compile_kernel_get_common_cflags(kernel_features);
  string cubin = compile_kernel(cflags, kernel_name);
  if (cubin.empty()) {
    return false;
  }

  /* Open module. */
  CUDAContextScope scope(this);

  string cubin_data;
  CUresult result;
  if (path_read_compressed_text(cubin, cubin_data)) {
    result = cuModuleLoadData(&cuModule, cubin_data.c_str());
  }
  else {
    result = CUDA_ERROR_FILE_NOT_FOUND;
  }

  if (result != CUDA_SUCCESS) {
    set_error(string_printf(
        "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
  }

  if (result == CUDA_SUCCESS) {
    reserve_local_memory(kernel_features);
  }

  return (result == CUDA_SUCCESS);
}
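/* With CU_CTX_LMEM_RESIZE_TO_MAX set on the context, the driver keeps the
 * local memory of the largest launched kernel allocated. Launching the
 * biggest kernel once up front makes later free-memory queries reliable. */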
void CUDADevice::reserve_local_memory(const uint kernel_features)
{
  size_t total = 0, free_before = 0, free_after = 0;

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_before, &total);
  }

  {
    /* Use the biggest kernel for estimation. */
    const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
                                     (kernel_features & KERNEL_FEATURE_MNEE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

    /* Launching a single block appears sufficient to reserve memory for all
     * multiprocessors. */
    CUDADeviceQueue queue(this);
    device_ptr d_path_index = 0, d_render_buffer = 0;
    int d_work_size = 0;
    DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
    queue.init_execution();
    queue.enqueue(test_kernel, 1, args);
    queue.synchronize();
  }

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_after, &total);
  }

# if 0
  /* For testing mapped host memory, fill up device memory. */
  const size_t keep_mb = 1024;

  while (free_after > keep_mb * 1024 * 1024LL) {
    CUdeviceptr tmp;
    cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
    cuMemGetInfo(&free_after, &total);
  }
# endif
}
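/* Plain device/host memory helpers, each entered through a context scope. */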
void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
  CUDAContextScope scope(this);

  cuMemGetInfo(&free, &total);
}
bool CUDADevice::alloc_device(void *&device_pointer, const size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
  return mem_alloc_result == CUDA_SUCCESS;
}
void CUDADevice::free_device(void *device_pointer)
{
  CUDAContextScope scope(this);

  cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}
bool CUDADevice::shared_alloc(void *&shared_pointer, const size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemHostAlloc(
      &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
  return mem_alloc_result == CUDA_SUCCESS;
}
void CUDADevice::shared_free(void *shared_pointer)
{
  CUDAContextScope scope(this);

  cuMemFreeHost(shared_pointer);
}
void *CUDADevice::shared_to_device_pointer(const void *shared_pointer)
{
  CUDAContextScope scope(this);
  void *device_pointer = nullptr;
  cuda_assert(
      cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, (void *)shared_pointer, 0));
  return device_pointer;
}
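/* Shared allocations are host memory mapped into the device address space
 * (CU_MEMHOSTALLOC_DEVICEMAP); kernels access them through the device pointer
 * returned here. */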
void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, const size_t size)
{
  const CUDAContextScope scope(this);

  cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
}
void CUDADevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
  }
  else if (mem.type == MEM_GLOBAL) {
    assert(!"mem_alloc not supported for global memory.");
  }
  else {
    generic_alloc(mem);
  }
}

void CUDADevice::mem_copy_to(device_memory &mem)
{
  /* Generic buffers are allocated on demand, then uploaded. */
  if (!mem.device_pointer) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }
  else {
    generic_copy_to(mem);
  }
}

void CUDADevice::mem_move_to_host(device_memory &mem)
{
  /* Only texture and global memory can be migrated to host memory. */
  if (mem.type != MEM_TEXTURE && mem.type != MEM_GLOBAL) {
    assert(!"mem_move_to_host only supported for texture and global memory");
  }
}
void CUDADevice::mem_copy_from(
    device_memory &mem, const size_t y, const size_t w, const size_t h, const size_t elem)
{
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_copy_from not supported for textures.");
  }
  else if (mem.host_pointer) {
    const size_t size = elem * w * h;
    const size_t offset = elem * y * w;

    const CUDAContextScope scope(this);
    cuda_assert(cuMemcpyDtoH(
        (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
  }
}
void CUDADevice::mem_zero(device_memory &mem)
{
  /* Zero the device-side copy within the device context. */
  const CUDAContextScope scope(this);
  cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
}
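/* Upload constants into the kernel_params module global: each named data
 * array is matched by name and written at its offset within KernelParamsCUDA. */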
void CUDADevice::const_copy_to(const char *name, void *host, const size_t size)
{
  CUDAContextScope scope(this);
  CUdeviceptr mem;
  size_t bytes;

  cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params"));

  /* Update data storage pointers in launch parameters. */
# define KERNEL_DATA_ARRAY(data_type, data_name) \
    if (strcmp(name, #data_name) == 0) { \
      cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \
      return; \
    }
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}
void CUDADevice::global_alloc(device_memory &mem)
{
  if (mem.is_resident(this)) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }

  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void CUDADevice::global_copy_to(device_memory &mem)
{
  if (!mem.device_pointer) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }
  else if (mem.is_resident(this)) {
    generic_copy_to(mem);
  }

  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}
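/* Texture uploads go through pitched 2D/3D memcpy descriptors, built by the
 * two helpers below for host-to-device transfers. */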
static CUDA_MEMCPY2D tex_2d_copy_param(const device_texture &mem, const int pitch_alignment)
{
  const size_t src_pitch = tex_src_pitch(mem);
  const size_t dst_pitch = align_up(src_pitch, pitch_alignment);

  CUDA_MEMCPY2D param;
  memset(&param, 0, sizeof(param));
  param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
  param.dstDevice = (CUdeviceptr)mem.device_pointer;
  param.dstPitch = dst_pitch;
  param.srcMemoryType = CU_MEMORYTYPE_HOST;
  param.srcHost = mem.host_pointer;
  param.srcPitch = src_pitch;
  param.WidthInBytes = param.srcPitch;
  param.Height = mem.data_height;

  return param;
}
static CUDA_MEMCPY3D tex_3d_copy_param(const device_texture &mem)
{
  const size_t src_pitch = tex_src_pitch(mem);

  CUDA_MEMCPY3D param;
  memset(&param, 0, sizeof(param));
  param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
  param.dstArray = (CUarray)mem.device_pointer;
  param.srcMemoryType = CU_MEMORYTYPE_HOST;
  param.srcHost = mem.host_pointer;
  param.srcPitch = src_pitch;
  param.WidthInBytes = param.srcPitch;
  param.Height = mem.data_height;
  param.Depth = mem.data_depth;

  return param;
}
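/* Example (hypothetical values): a 2D float4 texture 13 texels wide has
 * src_pitch = 13 * 16 = 208 bytes; with pitch_alignment = 32 the device pitch
 * becomes align_up(208, 32) = 224 bytes. */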
void CUDADevice::tex_alloc(device_texture &mem)
{
  CUDAContextScope scope(this);

  CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
  switch (mem.info.extension) {
    case EXTENSION_REPEAT:
      address_mode = CU_TR_ADDRESS_MODE_WRAP;
      break;
    case EXTENSION_EXTEND:
      address_mode = CU_TR_ADDRESS_MODE_CLAMP;
      break;
    case EXTENSION_CLIP:
      address_mode = CU_TR_ADDRESS_MODE_BORDER;
      break;
    case EXTENSION_MIRROR:
      address_mode = CU_TR_ADDRESS_MODE_MIRROR;
      break;
    default:
      assert(0);
      break;
  }

  CUfilter_mode filter_mode;
  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
    filter_mode = CU_TR_FILTER_MODE_POINT;
  }
  else {
    filter_mode = CU_TR_FILTER_MODE_LINEAR;
  }

  /* Image Texture Storage */
  CUarray_format_enum format;
  switch (mem.data_type) {
    case TYPE_UCHAR:
      format = CU_AD_FORMAT_UNSIGNED_INT8;
      break;
    case TYPE_UINT16:
      format = CU_AD_FORMAT_UNSIGNED_INT16;
      break;
    case TYPE_FLOAT:
      format = CU_AD_FORMAT_FLOAT;
      break;
    case TYPE_HALF:
      format = CU_AD_FORMAT_HALF;
      break;
    default:
      assert(0);
      return;
  }

  Mem *cmem = nullptr;
  CUarray array_3d = nullptr;

  if (!mem.is_resident(this)) {
    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
    if (mem.data_depth > 1) {
      array_3d = (CUarray)mem.device_pointer;
      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
  }
  else if (mem.data_depth > 1) {
    /* 3D texture using array, there is no API for linear memory. */
    CUDA_ARRAY3D_DESCRIPTOR desc;
    desc.Width = mem.data_width;
    desc.Height = mem.data_height;
    desc.Depth = mem.data_depth;
    desc.Format = format;
    desc.NumChannels = mem.data_elements;
    desc.Flags = 0;

    cuda_assert(cuArray3DCreate(&array_3d, &desc));
    if (!array_3d) {
      return;
    }

    mem.device_pointer = (device_ptr)array_3d;

    const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
    cuda_assert(cuMemcpy3D(&param));

    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
    const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);
    const size_t dst_size = dst_pitch * mem.data_height;

    cmem = generic_alloc(mem, dst_size - mem.memory_size());
    if (!cmem) {
      return;
    }

    const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
    cuda_assert(cuMemcpy2DUnaligned(&param));
  }
  else {
    /* 1D texture, using linear memory. */
    cmem = generic_alloc(mem);
    if (!cmem) {
      return;
    }
  }

  /* Resource descriptor. */
  CUDA_RESOURCE_DESC resDesc;
  memset(&resDesc, 0, sizeof(resDesc));

  if (array_3d) {
    resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
    resDesc.res.array.hArray = array_3d;
  }
  else if (mem.data_height > 0) {
    const size_t dst_pitch = align_up(tex_src_pitch(mem), pitch_alignment);

    resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
    resDesc.res.pitch2D.devPtr = (CUdeviceptr)mem.device_pointer;
    resDesc.res.pitch2D.format = format;
    resDesc.res.pitch2D.numChannels = mem.data_elements;
    resDesc.res.pitch2D.width = mem.data_width;
    resDesc.res.pitch2D.height = mem.data_height;
    resDesc.res.pitch2D.pitchInBytes = dst_pitch;
  }
  else {
    resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
    resDesc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer;
    resDesc.res.linear.format = format;
    resDesc.res.linear.numChannels = mem.data_elements;
    resDesc.res.linear.sizeInBytes = mem.device_size;
  }

  /* Texture descriptor. */
  CUDA_TEXTURE_DESC texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0] = address_mode;
  texDesc.addressMode[1] = address_mode;
  texDesc.addressMode[2] = address_mode;
  texDesc.filterMode = filter_mode;
  texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

  {
    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];

    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, nullptr));
  }

  TextureInfo tex_info = mem.info;
  tex_info.data = (uint64_t)cmem->texobject;

  const uint slot = mem.slot;
  if (slot >= texture_info.size()) {
    /* Allocate some slots in advance, to reduce amount of re-allocations. */
    texture_info.resize(slot + 128);
  }

  texture_info[slot] = tex_info;
  need_texture_info = true;
}
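/* Re-upload texture contents: allocate on first use (the allocation performs
 * the initial copy), otherwise copy through the matching 3D/2D/linear path. */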
void CUDADevice::tex_copy_to(device_texture &mem)
{
  bool texture_allocated = false;
  {
    thread_scoped_lock lock(device_mem_map_mutex);
    texture_allocated = mem.slot < texture_info.size() && texture_info[mem.slot].data != 0;
  }

  if (!texture_allocated) {
    /* Not allocated yet; the allocation performs the initial copy. */
    tex_alloc(mem);
  }
  else if (mem.data_depth > 1) {
    CUDAContextScope scope(this);
    const CUDA_MEMCPY3D param = tex_3d_copy_param(mem);
    cuda_assert(cuMemcpy3D(&param));
  }
  else if (mem.data_height > 0) {
    CUDAContextScope scope(this);
    const CUDA_MEMCPY2D param = tex_2d_copy_param(mem, pitch_alignment);
    cuda_assert(cuMemcpy2DUnaligned(&param));
  }
  else {
    generic_copy_to(mem);
  }
}
void CUDADevice::tex_free(device_texture &mem)
{
  CUDAContextScope scope(this);
  thread_scoped_lock lock(device_mem_map_mutex);

  auto it = device_mem_map.find(&mem);
  if (it == device_mem_map.end()) {
    return;
  }
  const Mem &cmem = it->second;

  if (cmem.texobject) {
    /* Free bindless texture. */
    cuTexObjectDestroy(cmem.texobject);
  }

  if (!mem.is_resident(this)) {
    /* Do not free memory here, since it was allocated on a different device. */
    device_mem_map.erase(device_mem_map.find(&mem));
  }
  else if (cmem.array) {
    /* Free 3D array. */
    cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
    device_mem_map.erase(device_mem_map.find(&mem));
  }
  else {
    lock.unlock();
    generic_free(mem);
  }
}
unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
{
  return make_unique<CUDADeviceQueue>(this);
}
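/* Graphics interop: only share buffers when the viewport graphics API runs on
 * this same physical GPU, matched by device list (OpenGL) or UUID (Vulkan). */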
bool CUDADevice::should_use_graphics_interop(const GraphicsInteropDevice &interop_device)
{
  CUDAContextScope scope(this);

  switch (interop_device.type) {
    case ccl::GraphicsInteropDevice::OPENGL: {
      /* Check whether this CUDA device is in the list of devices backing the
       * OpenGL context. */
      int num_all_devices = 0;
      cuda_assert(cuDeviceGetCount(&num_all_devices));
      if (num_all_devices == 0) {
        return false;
      }

      vector<CUdevice> gl_devices(num_all_devices);
      uint num_gl_devices = 0;
      cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);

      bool found = false;
      for (uint i = 0; i < num_gl_devices; ++i) {
        if (gl_devices[i] == cuDevice) {
          found = true;
          break;
        }
      }

      if (found) {
        VLOG_INFO << "Graphics interop: found matching OpenGL device for CUDA";
      }
      else {
        VLOG_INFO << "Graphics interop: no matching OpenGL device for CUDA";
      }
      return found;
    }
    case ccl::GraphicsInteropDevice::VULKAN: {
      /* Match Vulkan and CUDA devices by their UUID. */
      CUuuid uuid = {};
      cuDeviceGetUuid(&uuid, cuDevice);
      const bool found = (sizeof(uuid.bytes) == interop_device.uuid.size() &&
                          memcmp(uuid.bytes, interop_device.uuid.data(), sizeof(uuid.bytes)) == 0);
      if (found) {
        VLOG_INFO << "Graphics interop: found matching Vulkan device for CUDA";
      }
      else {
        VLOG_INFO << "Graphics interop: no matching Vulkan device for CUDA";
      }
      VLOG_INFO << "Graphics Interop: CUDA UUID "
                << string_hex((const uint8_t *)uuid.bytes, sizeof(uuid.bytes));
      return found;
    }
    default:
      return false;
  }
}
int CUDADevice::get_num_multiprocessors()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
}

int CUDADevice::get_max_num_threads_per_multiprocessor()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
}
bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
{
  CUDAContextScope scope(this);

  return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
}

int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, const int default_value)
{
  int value = 0;
  if (!get_device_attribute(attribute, &value)) {
    return default_value;
  }
  return value;
}