bool CUDADevice::have_precompiled_kernels()
{
  string cubins_path = path_get("lib");
  return path_exists(cubins_path);
}

void CUDADevice::set_error(const string &error)
{
  Device::set_error(error);

  if (first_error) {
    fprintf(stderr,
            "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
    fprintf(stderr,
            "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
    first_error = false;
  }
}

CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : GPUDevice(info, stats, profiler, headless)
{
  /* Verify that base class memory object handles are wide enough to hold the CUDA ones. */
  static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
  static_assert(sizeof(arrayMemObject) == sizeof(CUarray));

  need_texture_info = false;

  /* Initialize CUDA. */
  CUresult result = cuInit(0);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
    return;
  }

  /* Setup device and context. */
  result = cuDeviceGet(&cuDevice, cuDevId);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
                            cuewErrorString(result)));
    return;
  }

  /* Query capabilities needed for host memory mapping and texture pitch. */
  int value = 0;
  cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
  can_map_host = value != 0;

  cuda_assert(cuDeviceGetAttribute(
      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));

  /* Configure the primary context to grow local memory as needed. */
  int active = 0;
  unsigned int ctx_flags = 0;
  cuda_assert(cuDevicePrimaryCtxGetState(cuDevice, &ctx_flags, &active));

  ctx_flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
  result = cuDevicePrimaryCtxSetFlags(cuDevice, ctx_flags);
  if (result != CUDA_SUCCESS && result != CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) {
    set_error(string_printf("Failed to configure CUDA context (%s)", cuewErrorString(result)));
    return;
  }

  result = cuDevicePrimaryCtxRetain(&cuContext, cuDevice);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
    return;
  }

  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
  cuDevArchitecture = major * 100 + minor * 10;
}

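/* Note: cuDevArchitecture encodes the compute capability in decimal: a compute
 * capability 8.6 device, for example, yields 8 * 100 + 6 * 10 = 860. */
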
CUDADevice::~CUDADevice()
{
  cuda_assert(cuModuleUnload(cuModule));
  cuda_assert(cuDevicePrimaryCtxRelease(cuDevice));
}

bool CUDADevice::support_device(const uint /*kernel_features*/)
{
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* We only support compute capability 3.0 and up. */
  if (major < 3) {
    set_error(string_printf(
        "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
    return false;
  }

  return true;
}

bool CUDADevice::check_peer_access(Device *peer_device)
{
  if (peer_device == this) {
    return false;
  }

  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);

  /* Ensure the devices can access each other's memory at all. */
  int can_access = 0;
  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }

  /* Ensure array access over the link is possible as well (for 3D textures). */
  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
                                      CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
                                      cuDevice,
                                      peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }

  /* Enable peer access in both directions. */
  {
    const CUDAContextScope scope(this);
    CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }
  {
    const CUDAContextScope scope(peer_device_cuda);
    CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }

  return true;
}

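/* Design note: CUDA peer access is granted per direction and per context, which is why
 * the check above enables it once from each context; CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED
 * is tolerated so that repeated checks between the same device pair stay idempotent. */
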
bool CUDADevice::use_adaptive_compilation()
{
  return DebugFlags().cuda.adaptive_compile;
}

string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
{
  const int machine = system_cpu_bits();
  const string source_path = path_get("source");
  const string include_path = source_path;
  string cflags = string_printf(
      "-m%d "
      "--ptxas-options=\"-v\" "
      "--use_fast_math "
      "-DNVCC "
      "-I\"%s\"",
      machine,
      include_path.c_str());
  if (use_adaptive_compilation()) {
    cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
  }
  const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
  if (extra_cflags) {
    cflags += string(" ") + string(extra_cflags);
  }

# ifdef WITH_NANOVDB
  cflags += " -DWITH_NANOVDB";
# endif

# ifdef WITH_CYCLES_DEBUG
  cflags += " -DWITH_CYCLES_DEBUG";
# endif

  return cflags;
}

string CUDADevice::compile_kernel(const string &common_cflags,
                                  const char *name,
                                  const char *base,
                                  bool force_ptx)
{
  /* Compute kernel name. */
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* Attempt to use kernel provided with Blender. */
  if (!use_adaptive_compilation()) {
    if (!force_ptx) {
      const string cubin = path_get(
          string_printf("lib/%s_sm_%d%d.cubin.zst", name, major, minor));
      VLOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
      if (path_exists(cubin)) {
        VLOG_INFO << "Using precompiled kernel.";
        return cubin;
      }
    }

    /* The driver can JIT-compile PTX built for older architectures, so find the closest one. */
    int ptx_major = major, ptx_minor = minor;
    while (ptx_major >= 3) {
      const string ptx = path_get(
          string_printf("lib/%s_compute_%d%d.ptx.zst", name, ptx_major, ptx_minor));
      VLOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
      if (path_exists(ptx)) {
        VLOG_INFO << "Using precompiled kernel.";
        return ptx;
      }

      if (ptx_minor > 0) {
        ptx_minor--;
      }
      else {
        ptx_major--;
        ptx_minor = 9;
      }
    }
  }

  /* Try to use locally compiled kernel. */
  string source_path = path_get("source");
  const string source_md5 = path_files_md5_hash(source_path);

  /* We include cflags into the md5, so that changing the CUDA toolkit or other compiler
   * options invalidates the cached kernel. */
  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);

  /* Path to cubin/ptx file. */
  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
  const char *const kernel_arch = force_ptx ? "compute" : "sm";

  const string cubin_file = string_printf(
      "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
  const string cubin = path_cache_get(path_join("kernels", cubin_file));
  VLOG_INFO << "Testing for locally compiled kernel " << cubin << ".";
  if (path_exists(cubin)) {
    VLOG_INFO << "Using locally compiled kernel.";
    return cubin;
  }

# ifdef _WIN32
  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
    if (major < 3) {
      set_error(string_printf(
          "CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
          "Your GPU is not supported.",
          major, minor));
    }
    else {
      set_error(string_printf(
          "CUDA binary kernel for this graphics card compute "
          "capability (%d.%d) not found.",
          major, minor));
    }
    return string();
  }
# endif

  /* Compile. */
  const char *const nvcc = cuewCompilerPath();
  if (nvcc == NULL) {
    set_error(
        "CUDA nvcc compiler not found. "
        "Install CUDA toolkit in default location.");
    return string();
  }

  const int nvcc_cuda_version = cuewCompilerVersion();
  VLOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
  if (nvcc_cuda_version < 101) {
    printf(
        "Unsupported CUDA version %d.%d detected, "
        "you need CUDA 10.1 or newer.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);
    return string();
  }
  else if (!(nvcc_cuda_version >= 102 && nvcc_cuda_version < 130)) {
    printf(
        "CUDA version %d.%d detected, build may succeed but only "
        "CUDA 10.1 to 12 are officially supported.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);
  }

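  /* cuewCompilerVersion() returns ten times the CUDA release number, so e.g. a value of
   * 118 is reported as "11.8" by the division and modulo above. */
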
  double starttime = time_dt();

  path_create_directories(cubin);

  source_path = path_join(path_join(source_path, "kernel"),
                          path_join("device", path_join(base, string_printf("%s.cu", name))));

  string command = string_printf("\"%s\" -arch=%s_%d%d --%s \"%s\" -o \"%s\" %s",
                                 nvcc, kernel_arch, major, minor, kernel_ext,
                                 source_path.c_str(), cubin.c_str(), common_cflags.c_str());

  printf("Compiling %sCUDA kernel ...\n%s\n",
         (use_adaptive_compilation()) ? "adaptive " : "",
         command.c_str());

# ifdef _WIN32
  command = "call " + command;
# endif
  if (system(command.c_str()) != 0) {
    set_error(
        "Failed to execute compilation command, "
        "see console for details.");
    return string();
  }

  /* Verify if compilation succeeded. */
  if (!path_exists(cubin)) {
    set_error(
        "CUDA kernel compilation failed, "
        "see console for details.");
    return string();
  }

  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

  return cubin;
}

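/* Example console output of a successful local build (hypothetical path and timing):
 *   Compiling CUDA kernel ...
 *   "/usr/local/cuda/bin/nvcc" -arch=sm_86 --cubin "<source>/kernel/device/cuda/kernel.cu" ...
 *   Kernel compilation finished in 42.17s.
 */
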
bool CUDADevice::load_kernels(const uint kernel_features)
{
  /* NOTE: Re-loading kernels is not supported, since it would invalidate memory pointers. */
  if (cuModule) {
    if (use_adaptive_compilation()) {
      VLOG_INFO
          << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
    }
    return true;
  }

  /* Check if CUDA init succeeded. */
  if (cuContext == 0) {
    return false;
  }

  /* Check if GPU is supported. */
  if (!support_device(kernel_features)) {
    return false;
  }

  /* Get kernel. */
  const char *kernel_name = "kernel";
  string cflags = compile_kernel_get_common_cflags(kernel_features);
  string cubin = compile_kernel(cflags, kernel_name);
  if (cubin.empty()) {
    return false;
  }

  /* Open module. */
  CUDAContextScope scope(this);

  string cubin_data;
  CUresult result;

  if (path_read_compressed_text(cubin, cubin_data)) {
    result = cuModuleLoadData(&cuModule, cubin_data.c_str());
  }
  else {
    result = CUDA_ERROR_FILE_NOT_FOUND;
  }

  if (result != CUDA_SUCCESS) {
    set_error(string_printf(
        "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
  }

  if (result == CUDA_SUCCESS) {
    reserve_local_memory(kernel_features);
  }

  return (result == CUDA_SUCCESS);
}

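/* Note: kernels ship compressed (the "lib/....zst" paths above), so the binary is read via
 * path_read_compressed_text() and loaded from memory with cuModuleLoadData() rather than
 * from a file path with cuModuleLoad(). */
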
void CUDADevice::reserve_local_memory(const uint kernel_features)
{
  /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory needed for kernel
   * launches, so that we can reliably figure out when to allocate scene data in mapped host
   * memory. */
  size_t total = 0, free_before = 0, free_after = 0;

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_before, &total);
  }

  {
    /* Use the biggest kernel for estimation. */
    const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
                                     (kernel_features & KERNEL_FEATURE_MNEE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

    /* Launching just 1 block appears sufficient to reserve memory for all multiprocessors. */
    CUDADeviceQueue queue(this);

    device_ptr d_path_index = 0;
    device_ptr d_render_buffer = 0;
    int d_work_size = 0;
    DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);

    queue.init_execution();
    queue.enqueue(test_kernel, 1, args);
    queue.synchronize();
  }

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_after, &total);
  }

# if 0
  /* For testing mapped host memory, fill up device memory. */
  const size_t keep_mb = 1024;

  while (free_after > keep_mb * 1024 * 1024LL) {
    CUdeviceptr tmp;
    cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
    cuMemGetInfo(&free_after, &total);
  }
# endif
}

void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
  CUDAContextScope scope(this);

  cuMemGetInfo(&free, &total);
}

bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
  return mem_alloc_result == CUDA_SUCCESS;
}

void CUDADevice::free_device(void *device_pointer)
{
  CUDAContextScope scope(this);

  cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}

bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemHostAlloc(
      &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
  return mem_alloc_result == CUDA_SUCCESS;
}

void CUDADevice::free_host(void *shared_pointer)
{
  CUDAContextScope scope(this);

  cuMemFreeHost(shared_pointer);
}

void CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
  CUDAContextScope scope(this);

  cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
}

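/* Note: alloc_host() requests CU_MEMHOSTALLOC_DEVICEMAP pinned memory, so this translation
 * to a device-visible address is what enables mapped host ("zero-copy") memory when device
 * memory is exhausted. */
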
void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
  const CUDAContextScope scope(this);

  cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
}

void CUDADevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
  }
  else if (mem.type == MEM_GLOBAL) {
    assert(!"mem_alloc not supported for global memory.");
  }
  else {
    generic_alloc(mem);
  }
}

void CUDADevice::mem_copy_to(device_memory &mem)
{
  if (!mem.device_pointer) {
    generic_alloc(mem);
  }
  generic_copy_to(mem);
}

void CUDADevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
{
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_copy_from not supported for textures.");
  }
  else if (mem.host_pointer) {
    /* Copy the requested row range back from device memory. */
    const size_t size = elem * w * h;
    const size_t offset = elem * y * w;

    if (mem.device_pointer) {
      const CUDAContextScope scope(this);
      cuda_assert(cuMemcpyDtoH(
          (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
    }
  }
}

void CUDADevice::mem_zero(device_memory &mem)
{
  const CUDAContextScope scope(this);
  cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
}

void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
{
  CUDAContextScope scope(this);
  CUdeviceptr mem;
  size_t bytes;

  cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params"));

  /* Update data storage pointers in launch parameters. */
# define KERNEL_DATA_ARRAY(data_type, data_name) \
  if (strcmp(name, #data_name) == 0) { \
    cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \
    return; \
  }
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}

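/* For illustration: for a hypothetical "tri_verts" array declared in kernel/data_arrays.h,
 * the macro above expands to
 *   if (strcmp(name, "tri_verts") == 0) {
 *     cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, tri_verts), host, size));
 *     return;
 *   }
 * so each named array updates its device pointer slot inside kernel_params. */
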
void CUDADevice::global_alloc(device_memory &mem)
{
  if (mem.is_resident(this)) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }

  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void CUDADevice::tex_alloc(device_texture &mem)
{
  CUDAContextScope scope(this);

  size_t dsize = datatype_size(mem.data_type);

  CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
  switch (mem.info.extension) {
    case EXTENSION_REPEAT:
      address_mode = CU_TR_ADDRESS_MODE_WRAP;
      break;
    case EXTENSION_EXTEND:
      address_mode = CU_TR_ADDRESS_MODE_CLAMP;
      break;
    case EXTENSION_CLIP:
      address_mode = CU_TR_ADDRESS_MODE_BORDER;
      break;
    case EXTENSION_MIRROR:
      address_mode = CU_TR_ADDRESS_MODE_MIRROR;
      break;
    default:
      assert(0);
      break;
  }

  CUfilter_mode filter_mode;
  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
    filter_mode = CU_TR_FILTER_MODE_POINT;
  }
  else {
    filter_mode = CU_TR_FILTER_MODE_LINEAR;
  }

  /* Image Texture Storage */
  CUarray_format_enum format;
  switch (mem.data_type) {
    case TYPE_UCHAR:
      format = CU_AD_FORMAT_UNSIGNED_INT8;
      break;
    case TYPE_UINT16:
      format = CU_AD_FORMAT_UNSIGNED_INT16;
      break;
    case TYPE_FLOAT:
      format = CU_AD_FORMAT_FLOAT;
      break;
    case TYPE_HALF:
      format = CU_AD_FORMAT_HALF;
      break;
    default:
      assert(0);
      return;
  }

  Mem *cmem = NULL;
  CUarray array_3d = NULL;
  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
  size_t dst_pitch = src_pitch;

  if (!mem.is_resident(this)) {
    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;

    if (mem.data_depth > 1) {
      array_3d = (CUarray)mem.device_pointer;
      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
    else if (mem.data_height > 0) {
      dst_pitch = align_up(src_pitch, pitch_alignment);
    }
  }
  else if (mem.data_depth > 1) {
    /* 3D texture using array, there is no API for linear memory. */
    CUDA_ARRAY3D_DESCRIPTOR desc;
    desc.Width = mem.data_width;
    desc.Height = mem.data_height;
    desc.Depth = mem.data_depth;
    desc.Format = format;
    desc.NumChannels = mem.data_elements;
    desc.Flags = 0;

    cuda_assert(cuArray3DCreate(&array_3d, &desc));
    if (!array_3d) {
      return;
    }

    CUDA_MEMCPY3D param;
    memset(&param, 0, sizeof(param));
    param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    param.dstArray = array_3d;
    param.srcMemoryType = CU_MEMORYTYPE_HOST;
    param.srcHost = mem.host_pointer;
    param.srcPitch = src_pitch;
    param.WidthInBytes = param.srcPitch;
    param.Height = mem.data_height;
    param.Depth = mem.data_depth;

    cuda_assert(cuMemcpy3D(&param));

    mem.device_pointer = (device_ptr)array_3d;

    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
    dst_pitch = align_up(src_pitch, pitch_alignment);
    const size_t dst_size = dst_pitch * mem.data_height;

    cmem = generic_alloc(mem, dst_size - mem.memory_size());
    if (!cmem) {
      return;
    }

    CUDA_MEMCPY2D param;
    memset(&param, 0, sizeof(param));
    param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    param.dstDevice = (CUdeviceptr)mem.device_pointer;
    param.dstPitch = dst_pitch;
    param.srcMemoryType = CU_MEMORYTYPE_HOST;
    param.srcHost = mem.host_pointer;
    param.srcPitch = src_pitch;
    param.WidthInBytes = param.srcPitch;
    param.Height = mem.data_height;

    cuda_assert(cuMemcpy2DUnaligned(&param));
  }
  else {
    /* 1D texture, using linear memory. */
    cmem = generic_alloc(mem);
    if (!cmem) {
      return;
    }

    cuda_assert(
        cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
  }

  /* Resize texture info array once. */
  const uint slot = mem.slot;
  if (slot >= texture_info.size()) {
    /* Allocate some slots in advance, to reduce amount of re-allocations. */
    texture_info.resize(slot + 128);
  }

  /* Set mapping and tag that we need to (re-)upload to device. */
  texture_info[slot] = mem.info;
  need_texture_info = true;

  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16)
  {
    /* Bindless textures. */
    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

    if (array_3d) {
      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
      resDesc.res.array.hArray = array_3d;
      resDesc.flags = 0;
    }
    else if (mem.data_height > 0) {
      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
      resDesc.res.pitch2D.devPtr = (CUdeviceptr)mem.device_pointer;
      resDesc.res.pitch2D.format = format;
      resDesc.res.pitch2D.numChannels = mem.data_elements;
      resDesc.res.pitch2D.height = mem.data_height;
      resDesc.res.pitch2D.width = mem.data_width;
      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
    }
    else {
      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
      resDesc.res.linear.devPtr = (CUdeviceptr)mem.device_pointer;
      resDesc.res.linear.format = format;
      resDesc.res.linear.numChannels = mem.data_elements;
      resDesc.res.linear.sizeInBytes = mem.device_size;
    }

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = address_mode;
    texDesc.addressMode[1] = address_mode;
    texDesc.addressMode[2] = address_mode;
    texDesc.filterMode = filter_mode;
    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];

    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

    texture_info[slot].data = (uint64_t)cmem->texobject;
  }
  else {
    texture_info[slot].data = (uint64_t)mem.device_pointer;
  }
}

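/* Design note: NanoVDB grids are raw device buffers sampled by dedicated kernel code, not
 * through CUDA texture hardware, which is why the branch above stores the plain device
 * pointer in the texture slot instead of creating a texture object. */
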
void CUDADevice::tex_free(device_texture &mem)
{
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
    thread_scoped_lock lock(device_mem_map_mutex);
    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
    const Mem &cmem = device_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture. */
      cuTexObjectDestroy(cmem.texobject);
    }

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else if (cmem.array) {
      /* Free array. */
      cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
      mem.device_pointer = 0;

      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
      generic_free(mem);
    }
  }
}

unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
{
  return make_unique<CUDADeviceQueue>(this);
}

bool CUDADevice::should_use_graphics_interop()
{
  /* Check whether this device is part of the OpenGL context, since graphics
   * interoperability with a device outside of it can be slower than a naive pixel copy. */
  CUDAContextScope scope(this);

  int num_all_devices = 0;
  cuda_assert(cuDeviceGetCount(&num_all_devices));

  if (num_all_devices == 0) {
    return false;
  }

  vector<CUdevice> gl_devices(num_all_devices);
  uint num_gl_devices = 0;
  cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);

  for (uint i = 0; i < num_gl_devices; ++i) {
    if (gl_devices[i] == cuDevice) {
      return true;
    }
  }

  return false;
}

int CUDADevice::get_num_multiprocessors()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
}

int CUDADevice::get_max_num_threads_per_multiprocessor()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
}

bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
{
  CUDAContextScope scope(this);

  return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
}

int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
{
  int value = 0;
  if (!get_device_attribute(attribute, &value)) {
    return default_value;
  }
  return value;
}

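/* Usage: get_num_multiprocessors() and get_max_num_threads_per_multiprocessor() above call
 * this with a default of 0, so callers can treat 0 as "attribute unavailable". */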