Blender V4.3
cuda/device_impl.cpp
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#ifdef WITH_CUDA

# include <climits>
# include <limits.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>

# include "device/cuda/device_impl.h"

# include "util/debug.h"
# include "util/foreach.h"
# include "util/log.h"
# include "util/map.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/string.h"
# include "util/system.h"
# include "util/time.h"
# include "util/types.h"
# include "util/windows.h"

# include "kernel/device/cuda/globals.h"

CCL_NAMESPACE_BEGIN

class CUDADevice;

bool CUDADevice::have_precompiled_kernels()
{
  string cubins_path = path_get("lib");
  return path_exists(cubins_path);
}

BVHLayoutMask CUDADevice::get_bvh_layout_mask(uint /*kernel_features*/) const
{
  return BVH_LAYOUT_BVH2;
}

void CUDADevice::set_error(const string &error)
{
  Device::set_error(error);

  if (first_error) {
    fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
    fprintf(stderr,
            "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
    first_error = false;
  }
}

CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : GPUDevice(info, stats, profiler, headless)
{
  /* Verify that base class types can be used with specific backend types. */
  static_assert(sizeof(texMemObject) == sizeof(CUtexObject));
  static_assert(sizeof(arrayMemObject) == sizeof(CUarray));

  first_error = true;

  cuDevId = info.num;
  cuDevice = 0;
  cuContext = 0;

  cuModule = 0;

  need_texture_info = false;

  pitch_alignment = 0;

  /* Initialize CUDA. */
  CUresult result = cuInit(0);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
    return;
  }

  /* Setup device and context. */
  result = cuDeviceGet(&cuDevice, cuDevId);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
                            cuewErrorString(result)));
    return;
  }

  /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
   * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
   * so we can predict which memory to map to host. */
  int value;
  cuda_assert(cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
  can_map_host = value != 0;

  cuda_assert(cuDeviceGetAttribute(
      &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));

  if (can_map_host) {
    init_host_memory();
  }

  int active = 0;
  unsigned int ctx_flags = 0;
  cuda_assert(cuDevicePrimaryCtxGetState(cuDevice, &ctx_flags, &active));

  /* Configure primary context only once. */
  if (active == 0) {
    ctx_flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
    result = cuDevicePrimaryCtxSetFlags(cuDevice, ctx_flags);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE) {
      set_error(string_printf("Failed to configure CUDA context (%s)", cuewErrorString(result)));
      return;
    }
  }

  /* Create context. */
  result = cuDevicePrimaryCtxRetain(&cuContext, cuDevice);

  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to retain CUDA context (%s)", cuewErrorString(result)));
    return;
  }

  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
  cuDevArchitecture = major * 100 + minor * 10;
}

CUDADevice::~CUDADevice()
{
  texture_info.free();
  if (cuModule) {
    cuda_assert(cuModuleUnload(cuModule));
  }
  cuda_assert(cuDevicePrimaryCtxRelease(cuDevice));
}
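
# if 0
/* Illustrative sketch (not part of the original file): the bare CUDA driver API
 * lifecycle that the constructor/destructor pair above implements. Error handling
 * is omitted, and device ordinal 0 is an arbitrary example. */
static void primary_context_lifecycle_example()
{
  CUdevice device;
  CUcontext context;

  cuInit(0);                                  /* Initialize the driver API once per process. */
  cuDeviceGet(&device, 0);                    /* Resolve ordinal to a device handle. */
  cuDevicePrimaryCtxRetain(&context, device); /* Share the device's primary context. */

  /* ... load modules, allocate memory, launch kernels ... */

  cuDevicePrimaryCtxRelease(device);          /* Balance the retain on shutdown. */
}
# endif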

bool CUDADevice::support_device(const uint /*kernel_features*/)
{
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* We only support sm_30 and above. */
  if (major < 3) {
    set_error(string_printf(
        "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
    return false;
  }

  return true;
}

bool CUDADevice::check_peer_access(Device *peer_device)
{
  if (peer_device == this) {
    return false;
  }
  if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
    return false;
  }

  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);

  int can_access = 0;
  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }

  /* Ensure array access over the link is possible as well (for 3D textures). */
  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
                                      CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
                                      cuDevice,
                                      peer_device_cuda->cuDevice));
  if (can_access == 0) {
    return false;
  }

  /* Enable peer access in both directions. */
  {
    const CUDAContextScope scope(this);
    CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }
  {
    const CUDAContextScope scope(peer_device_cuda);
    CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED) {
      set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                              cuewErrorString(result)));
      return false;
    }
  }

  return true;
}

bool CUDADevice::use_adaptive_compilation()
{
  return DebugFlags().cuda.adaptive_compile;
}

/* Common NVCC flags which stay the same regardless of shading model or
 * kernel sources md5, and only depend on compiler or compilation settings.
 */
string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
{
  const int machine = system_cpu_bits();
  const string source_path = path_get("source");
  const string include_path = source_path;
  string cflags = string_printf(
      "-m%d "
      "--ptxas-options=\"-v\" "
      "--use_fast_math "
      "-DNVCC "
      "-I\"%s\"",
      machine,
      include_path.c_str());
  if (use_adaptive_compilation()) {
    cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
  }
  const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
  if (extra_cflags) {
    cflags += string(" ") + string(extra_cflags);
  }

# ifdef WITH_NANOVDB
  cflags += " -DWITH_NANOVDB";
# endif

# ifdef WITH_CYCLES_DEBUG
  cflags += " -DWITH_CYCLES_DEBUG";
# endif

  return cflags;
}
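
/* For reference (illustrative only, not from the original file): on a hypothetical
 * 64-bit build with the Cycles source in /src, the function above yields roughly
 *
 *   -m64 --ptxas-options="-v" --use_fast_math -DNVCC -I"/src"
 *
 * plus -D__KERNEL_FEATURES__=<mask> for adaptive compilation, anything set in
 * CYCLES_CUDA_EXTRA_CFLAGS, and the optional -DWITH_NANOVDB / -DWITH_CYCLES_DEBUG
 * defines. */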

string CUDADevice::compile_kernel(const string &common_cflags,
                                  const char *name,
                                  const char *base,
                                  bool force_ptx)
{
  /* Compute kernel name. */
  int major, minor;
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  /* Attempt to use kernel provided with Blender. */
  if (!use_adaptive_compilation()) {
    if (!force_ptx) {
      const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin.zst", name, major, minor));
      VLOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
      if (path_exists(cubin)) {
        VLOG_INFO << "Using precompiled kernel.";
        return cubin;
      }
    }

    /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
    int ptx_major = major, ptx_minor = minor;
    while (ptx_major >= 3) {
      const string ptx = path_get(
          string_printf("lib/%s_compute_%d%d.ptx.zst", name, ptx_major, ptx_minor));
      VLOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
      if (path_exists(ptx)) {
        VLOG_INFO << "Using precompiled kernel.";
        return ptx;
      }

      if (ptx_minor > 0) {
        ptx_minor--;
      }
      else {
        ptx_major--;
        ptx_minor = 9;
      }
    }
  }
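
  /* Worked example (illustrative): on a compute 8.9 device, the loop above probes
   * lib/<name>_compute_89.ptx.zst, then _88 down to _80, then _79 down to _70, and
   * so on until lib/<name>_compute_30.ptx.zst, returning the first archive found. */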

  /* Try to use locally compiled kernel. */
  string source_path = path_get("source");
  const string source_md5 = path_files_md5_hash(source_path);

  /* We include cflags in the md5, so that changing the CUDA toolkit or other
   * compiler command line arguments makes sure the cubin gets re-built.
   */
  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);

  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
  const char *const kernel_arch = force_ptx ? "compute" : "sm";
  const string cubin_file = string_printf(
      "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
  const string cubin = path_cache_get(path_join("kernels", cubin_file));
  VLOG_INFO << "Testing for locally compiled kernel " << cubin << ".";
  if (path_exists(cubin)) {
    VLOG_INFO << "Using locally compiled kernel.";
    return cubin;
  }

# ifdef _WIN32
  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
    if (major < 3) {
      set_error(
          string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
                        "Your GPU is not supported.",
                        major,
                        minor));
    }
    else {
      set_error(
          string_printf("CUDA binary kernel for this graphics card compute "
                        "capability (%d.%d) not found.",
                        major,
                        minor));
    }
    return string();
  }
# endif

  /* Compile. */
  const char *const nvcc = cuewCompilerPath();
  if (nvcc == NULL) {
    set_error(
        "CUDA nvcc compiler not found. "
        "Install CUDA toolkit in default location.");
    return string();
  }

  const int nvcc_cuda_version = cuewCompilerVersion();
  VLOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
  if (nvcc_cuda_version < 101) {
    printf(
        "Unsupported CUDA version %d.%d detected, "
        "you need CUDA 10.1 or newer.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);
    return string();
  }
  else if (!(nvcc_cuda_version >= 102 && nvcc_cuda_version < 130)) {
    printf(
        "CUDA version %d.%d detected, build may succeed but only "
        "CUDA 10.1 to 12 are officially supported.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);
  }

  double starttime = time_dt();

  path_create_directories(cubin);

  source_path = path_join(path_join(source_path, "kernel"),
                          path_join("device", path_join(base, string_printf("%s.cu", name))));

  string command = string_printf(
      "\"%s\" "
      "-arch=%s_%d%d "
      "--%s \"%s\" "
      "-o \"%s\" "
      "%s",
      nvcc,
      kernel_arch,
      major,
      minor,
      kernel_ext,
      source_path.c_str(),
      cubin.c_str(),
      common_cflags.c_str());

  printf("Compiling %sCUDA kernel ...\n%s\n",
         (use_adaptive_compilation()) ? "adaptive " : "",
         command.c_str());

# ifdef _WIN32
  command = "call " + command;
# endif
  if (system(command.c_str()) != 0) {
    set_error(
        "Failed to execute compilation command, "
        "see console for details.");
    return string();
  }

  /* Verify if compilation succeeded. */
  if (!path_exists(cubin)) {
    set_error(
        "CUDA kernel compilation failed, "
        "see console for details.");
    return string();
  }

  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

  return cubin;
}
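
/* For illustration (hypothetical paths and capability, not from the original file):
 * the command assembled above resembles
 *
 *   "/usr/local/cuda/bin/nvcc" -arch=sm_86 --cubin "/src/kernel/device/cuda/kernel.cu"
 *       -o "/cache/kernels/cycles_kernel_sm_86_<md5>.cubin" -m64
 *       --ptxas-options="-v" --use_fast_math -DNVCC -I"/src"
 */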

bool CUDADevice::load_kernels(const uint kernel_features)
{
  /* TODO(sergey): Support kernels re-load for CUDA devices adaptive compile.
   *
   * Currently re-loading kernel will invalidate memory pointers,
   * causing problems in cuCtxSynchronize.
   */
  if (cuModule) {
    if (use_adaptive_compilation()) {
      VLOG_INFO
          << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
    }
    return true;
  }

  /* Check if CUDA init succeeded. */
  if (cuContext == 0) {
    return false;
  }

  /* Check if the GPU is supported. */
  if (!support_device(kernel_features)) {
    return false;
  }

  /* Get kernel. */
  const char *kernel_name = "kernel";
  string cflags = compile_kernel_get_common_cflags(kernel_features);
  string cubin = compile_kernel(cflags, kernel_name);
  if (cubin.empty()) {
    return false;
  }

  /* Open module. */
  CUDAContextScope scope(this);

  string cubin_data;
  CUresult result;

  if (path_read_compressed_text(cubin, cubin_data)) {
    result = cuModuleLoadData(&cuModule, cubin_data.c_str());
  }
  else {
    result = CUDA_ERROR_FILE_NOT_FOUND;
  }

  if (result != CUDA_SUCCESS) {
    set_error(string_printf(
        "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
  }

  if (result == CUDA_SUCCESS) {
    kernels.load(this);
    reserve_local_memory(kernel_features);
  }

  return (result == CUDA_SUCCESS);
}

void CUDADevice::reserve_local_memory(const uint kernel_features)
{
  /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
   * needed for kernel launches, so that we can reliably figure out when
   * to allocate scene data in mapped host memory. */
  size_t total = 0, free_before = 0, free_after = 0;

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_before, &total);
  }

  {
    /* Use the biggest kernel for estimation. */
    const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
                                     (kernel_features & KERNEL_FEATURE_MNEE) ?
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
                                         DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;

    /* Launch kernel, using just 1 block appears sufficient to reserve memory for all
     * multiprocessors. It would be good to do this in parallel for the multi GPU case
     * still to make it faster. */
    CUDADeviceQueue queue(this);

    device_ptr d_path_index = 0;
    device_ptr d_render_buffer = 0;
    int d_work_size = 0;
    DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);

    queue.init_execution();
    queue.enqueue(test_kernel, 1, args);
    queue.synchronize();
  }

  {
    CUDAContextScope scope(this);
    cuMemGetInfo(&free_after, &total);
  }

  VLOG_INFO << "Local memory reserved " << string_human_readable_number(free_before - free_after)
            << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";

# if 0
  /* For testing mapped host memory, fill up device memory. */
  const size_t keep_mb = 1024;

  while (free_after > keep_mb * 1024 * 1024LL) {
    CUdeviceptr tmp;
    cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
    cuMemGetInfo(&free_after, &total);
  }
# endif
}

void CUDADevice::get_device_memory_info(size_t &total, size_t &free)
{
  CUDAContextScope scope(this);

  cuMemGetInfo(&free, &total);
}

bool CUDADevice::alloc_device(void *&device_pointer, size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemAlloc((CUdeviceptr *)&device_pointer, size);
  return mem_alloc_result == CUDA_SUCCESS;
}

void CUDADevice::free_device(void *device_pointer)
{
  CUDAContextScope scope(this);

  cuda_assert(cuMemFree((CUdeviceptr)device_pointer));
}

bool CUDADevice::alloc_host(void *&shared_pointer, size_t size)
{
  CUDAContextScope scope(this);

  CUresult mem_alloc_result = cuMemHostAlloc(
      &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
  return mem_alloc_result == CUDA_SUCCESS;
}

void CUDADevice::free_host(void *shared_pointer)
{
  CUDAContextScope scope(this);

  cuMemFreeHost(shared_pointer);
}

void CUDADevice::transform_host_pointer(void *&device_pointer, void *&shared_pointer)
{
  CUDAContextScope scope(this);

  cuda_assert(cuMemHostGetDevicePointer_v2((CUdeviceptr *)&device_pointer, shared_pointer, 0));
}
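
# if 0
/* Illustrative sketch (not part of the original file): the zero-copy pattern that
 * alloc_host / transform_host_pointer / free_host above implement with the raw
 * driver API. Assumes a current context; error handling is omitted. */
static void mapped_host_memory_example(size_t size)
{
  void *host = NULL;
  CUdeviceptr device = 0;

  /* Pinned, device-mapped, write-combined host allocation (alloc_host). */
  cuMemHostAlloc(&host, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);

  /* Device-side alias of the same memory (transform_host_pointer). */
  cuMemHostGetDevicePointer_v2(&device, host, 0);

  /* Kernels can now read through `device` while the CPU writes through `host`. */

  cuMemFreeHost(host); /* free_host */
}
# endif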

void CUDADevice::copy_host_to_device(void *device_pointer, void *host_pointer, size_t size)
{
  const CUDAContextScope scope(this);

  cuda_assert(cuMemcpyHtoD((CUdeviceptr)device_pointer, host_pointer, size));
}

void CUDADevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
  }
  else if (mem.type == MEM_GLOBAL) {
    assert(!"mem_alloc not supported for global memory.");
  }
  else {
    generic_alloc(mem);
  }
}

void CUDADevice::mem_copy_to(device_memory &mem)
{
  if (mem.type == MEM_GLOBAL) {
    global_free(mem);
    global_alloc(mem);
  }
  else if (mem.type == MEM_TEXTURE) {
    tex_free((device_texture &)mem);
    tex_alloc((device_texture &)mem);
  }
  else {
    if (!mem.device_pointer) {
      generic_alloc(mem);
    }
    generic_copy_to(mem);
  }
}

void CUDADevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
{
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_copy_from not supported for textures.");
  }
  else if (mem.host_pointer) {
    const size_t size = elem * w * h;
    const size_t offset = elem * y * w;

    if (mem.device_pointer) {
      const CUDAContextScope scope(this);
      cuda_assert(cuMemcpyDtoH(
          (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
    }
    else {
      memset((char *)mem.host_pointer + offset, 0, size);
    }
  }
}
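
/* Worked example (illustrative): with elem = 4 (e.g. float), w = 1024 and h = 1,
 * copying row y = 2 above reads size = 4 * 1024 * 1 = 4096 bytes starting at
 * offset = 4 * 2 * 1024 = 8192 bytes into both the device and host buffers. */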

void CUDADevice::mem_zero(device_memory &mem)
{
  if (!mem.device_pointer) {
    mem_alloc(mem);
  }
  if (!mem.device_pointer) {
    return;
  }

  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
   * regardless of mem.host_pointer and mem.shared_pointer. */
  thread_scoped_lock lock(device_mem_map_mutex);
  if (!device_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
    const CUDAContextScope scope(this);
    cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
  }
  else if (mem.host_pointer) {
    memset(mem.host_pointer, 0, mem.memory_size());
  }
}

void CUDADevice::mem_free(device_memory &mem)
{
  if (mem.type == MEM_GLOBAL) {
    global_free(mem);
  }
  else if (mem.type == MEM_TEXTURE) {
    tex_free((device_texture &)mem);
  }
  else {
    generic_free(mem);
  }
}

device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
{
  return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
}

void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
{
  CUDAContextScope scope(this);
  CUdeviceptr mem;
  size_t bytes;

  cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params"));
  assert(bytes == sizeof(KernelParamsCUDA));

  /* Update data storage pointers in launch parameters. */
# define KERNEL_DATA_ARRAY(data_type, data_name) \
    if (strcmp(name, #data_name) == 0) { \
      cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \
      return; \
    }
  KERNEL_DATA_ARRAY(KernelData, data)
  KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}
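
# if 0
/* Illustrative expansion (not part of the original file): for the integrator_state
 * entry, the KERNEL_DATA_ARRAY macro above expands to roughly the following. */
if (strcmp(name, "integrator_state") == 0) {
  cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, integrator_state), host, size));
  return;
}
# endif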

void CUDADevice::global_alloc(device_memory &mem)
{
  if (mem.is_resident(this)) {
    generic_alloc(mem);
    generic_copy_to(mem);
  }

  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
}

void CUDADevice::global_free(device_memory &mem)
{
  if (mem.is_resident(this) && mem.device_pointer) {
    generic_free(mem);
  }
}

void CUDADevice::tex_alloc(device_texture &mem)
{
  CUDAContextScope scope(this);

  size_t dsize = datatype_size(mem.data_type);
  size_t size = mem.memory_size();

  CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
  switch (mem.info.extension) {
    case EXTENSION_REPEAT:
      address_mode = CU_TR_ADDRESS_MODE_WRAP;
      break;
    case EXTENSION_EXTEND:
      address_mode = CU_TR_ADDRESS_MODE_CLAMP;
      break;
    case EXTENSION_CLIP:
      address_mode = CU_TR_ADDRESS_MODE_BORDER;
      break;
    case EXTENSION_MIRROR:
      address_mode = CU_TR_ADDRESS_MODE_MIRROR;
      break;
    default:
      assert(0);
      break;
  }

  CUfilter_mode filter_mode;
  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
    filter_mode = CU_TR_FILTER_MODE_POINT;
  }
  else {
    filter_mode = CU_TR_FILTER_MODE_LINEAR;
  }

  /* Image Texture Storage */
  /* Cycles expects to read all texture data as normalized float values in
   * kernel/device/gpu/image.h. But storing all data as floats would be very inefficient due to
   * the huge size of float textures. So in the code below, we define different texture types,
   * including integer types, with the aim of using CUDA's default promotion of integer data to
   * floating point data in the range [0, 1], as noted in the CUDA documentation for the
   * cuTexObjectCreate API call.
   * Note that 32-bit integers are not supported by this promotion behavior and cannot be used
   * with Cycles' current implementation in kernel/device/gpu/image.h.
   */
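
  /* For illustration (hypothetical device-side sketch, not from the original file):
   * a CUtexObject created from CU_AD_FORMAT_UNSIGNED_INT8 data and sampled without
   * CU_TRSF_READ_AS_INTEGER behaves like
   *
   *   float4 texel = tex2D<float4>(tex, x, y);  // uint8 texels arrive in [0, 1]
   *
   * in device code. */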
  CUarray_format_enum format;
  switch (mem.data_type) {
    case TYPE_UCHAR:
      format = CU_AD_FORMAT_UNSIGNED_INT8;
      break;
    case TYPE_UINT16:
      format = CU_AD_FORMAT_UNSIGNED_INT16;
      break;
    case TYPE_FLOAT:
      format = CU_AD_FORMAT_FLOAT;
      break;
    case TYPE_HALF:
      format = CU_AD_FORMAT_HALF;
      break;
    default:
      assert(0);
      return;
  }

  Mem *cmem = NULL;
  CUarray array_3d = NULL;
  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
  size_t dst_pitch = src_pitch;

  if (!mem.is_resident(this)) {
    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;

    if (mem.data_depth > 1) {
      array_3d = (CUarray)mem.device_pointer;
      cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
    }
    else if (mem.data_height > 0) {
      dst_pitch = align_up(src_pitch, pitch_alignment);
    }
  }
  else if (mem.data_depth > 1) {
    /* 3D texture using array, there is no API for linear memory. */
    CUDA_ARRAY3D_DESCRIPTOR desc;

    desc.Width = mem.data_width;
    desc.Height = mem.data_height;
    desc.Depth = mem.data_depth;
    desc.Format = format;
    desc.NumChannels = mem.data_elements;
    desc.Flags = 0;

    VLOG_WORK << "Array 3D allocate: " << mem.name << ", "
              << string_human_readable_number(mem.memory_size()) << " bytes. ("
              << string_human_readable_size(mem.memory_size()) << ")";

    cuda_assert(cuArray3DCreate(&array_3d, &desc));

    if (!array_3d) {
      return;
    }

    CUDA_MEMCPY3D param;
    memset(&param, 0, sizeof(param));
    param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    param.dstArray = array_3d;
    param.srcMemoryType = CU_MEMORYTYPE_HOST;
    param.srcHost = mem.host_pointer;
    param.srcPitch = src_pitch;
    param.WidthInBytes = param.srcPitch;
    param.Height = mem.data_height;
    param.Depth = mem.data_depth;

    cuda_assert(cuMemcpy3D(&param));

    mem.device_pointer = (device_ptr)array_3d;
    mem.device_size = size;
    stats.mem_alloc(size);

    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];
    cmem->texobject = 0;
    cmem->array = reinterpret_cast<arrayMemObject>(array_3d);
  }
  else if (mem.data_height > 0) {
    /* 2D texture, using pitch aligned linear memory. */
    dst_pitch = align_up(src_pitch, pitch_alignment);
    size_t dst_size = dst_pitch * mem.data_height;

    cmem = generic_alloc(mem, dst_size - mem.memory_size());
    if (!cmem) {
      return;
    }

    CUDA_MEMCPY2D param;
    memset(&param, 0, sizeof(param));
    param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    param.dstDevice = mem.device_pointer;
    param.dstPitch = dst_pitch;
    param.srcMemoryType = CU_MEMORYTYPE_HOST;
    param.srcHost = mem.host_pointer;
    param.srcPitch = src_pitch;
    param.WidthInBytes = param.srcPitch;
    param.Height = mem.data_height;

    cuda_assert(cuMemcpy2DUnaligned(&param));
  }
  else {
    /* 1D texture, using linear memory. */
    cmem = generic_alloc(mem);
    if (!cmem) {
      return;
    }

    cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
  }

  /* Resize once. */
  const uint slot = mem.slot;
  if (slot >= texture_info.size()) {
    /* Allocate some slots in advance, to reduce amount
     * of re-allocations. */
    texture_info.resize(slot + 128);
  }

  /* Set mapping and tag that we need to (re-)upload to device. */
  texture_info[slot] = mem.info;
  need_texture_info = true;

  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16 &&
      mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN)
  {
    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

    if (array_3d) {
      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
      resDesc.res.array.hArray = array_3d;
      resDesc.flags = 0;
    }
    else if (mem.data_height > 0) {
      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
      resDesc.res.pitch2D.devPtr = mem.device_pointer;
      resDesc.res.pitch2D.format = format;
      resDesc.res.pitch2D.numChannels = mem.data_elements;
      resDesc.res.pitch2D.height = mem.data_height;
      resDesc.res.pitch2D.width = mem.data_width;
      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
    }
    else {
      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
      resDesc.res.linear.devPtr = mem.device_pointer;
      resDesc.res.linear.format = format;
      resDesc.res.linear.numChannels = mem.data_elements;
      resDesc.res.linear.sizeInBytes = mem.device_size;
    }

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = address_mode;
    texDesc.addressMode[1] = address_mode;
    texDesc.addressMode[2] = address_mode;
    texDesc.filterMode = filter_mode;
    /* CUDA's CU_TRSF_READ_AS_INTEGER flag is intentionally not used; this is
     * significant, see the explanation above of how Cycles treats textures. */
    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

    thread_scoped_lock lock(device_mem_map_mutex);
    cmem = &device_mem_map[&mem];

    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

    texture_info[slot].data = (uint64_t)cmem->texobject;
  }
  else {
    texture_info[slot].data = (uint64_t)mem.device_pointer;
  }
}
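
/* Worked example (illustrative, not from the original file): pitch alignment in the
 * 2D texture path above. A 100-texel-wide RGBA uchar texture has
 *   src_pitch = 100 * 1 * 4 = 400 bytes,
 * and with a pitch_alignment of 32 the padded row becomes
 *   dst_pitch = align_up(400, 32) = 416 bytes,
 * so each device row carries 16 bytes of padding that cuMemcpy2DUnaligned skips. */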

void CUDADevice::tex_free(device_texture &mem)
{
  if (mem.device_pointer) {
    CUDAContextScope scope(this);
    thread_scoped_lock lock(device_mem_map_mutex);
    DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
    const Mem &cmem = device_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture. */
      cuTexObjectDestroy(cmem.texobject);
    }

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else if (cmem.array) {
      /* Free array. */
      cuArrayDestroy(reinterpret_cast<CUarray>(cmem.array));
      stats.mem_free(mem.device_size);
      mem.device_pointer = 0;
      mem.device_size = 0;

      device_mem_map.erase(device_mem_map.find(&mem));
    }
    else {
      lock.unlock();
      generic_free(mem);
    }
  }
}

unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
{
  return make_unique<CUDADeviceQueue>(this);
}

bool CUDADevice::should_use_graphics_interop()
{
  /* Check whether this device is part of the OpenGL context.
   *
   * Using a CUDA device for graphics interoperability when it is not part of the OpenGL
   * context is possible, but empirical measurements show it can be considerably slower
   * than a naive pixel copy. */

  if (headless) {
    /* Avoid any call which might involve interaction with a graphics backend when we know
     * that we don't have an active graphics context. This avoids a crash on certain
     * platforms when calling cuGLGetDevices(). */
    return false;
  }

  CUDAContextScope scope(this);

  int num_all_devices = 0;
  cuda_assert(cuDeviceGetCount(&num_all_devices));

  if (num_all_devices == 0) {
    return false;
  }

  vector<CUdevice> gl_devices(num_all_devices);
  uint num_gl_devices = 0;
  cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);

  for (uint i = 0; i < num_gl_devices; ++i) {
    if (gl_devices[i] == cuDevice) {
      return true;
    }
  }

  return false;
}

int CUDADevice::get_num_multiprocessors()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
}

int CUDADevice::get_max_num_threads_per_multiprocessor()
{
  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
}

bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
{
  CUDAContextScope scope(this);

  return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
}

int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
{
  int value = 0;
  if (!get_device_attribute(attribute, &value)) {
    return default_value;
  }
  return value;
}

CCL_NAMESPACE_END

#endif