# ifndef WITH_CUDA_DYNLOAD

const char *cuewErrorString(CUresult result)

const char *cuewCompilerPath()
  return CYCLES_CUDA_NVCC_EXECUTABLE;

int cuewCompilerVersion()
  return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
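/* Note: CUDA_VERSION as defined by cuda.h encodes major * 1000 + minor * 10 (10010 for
 * CUDA 10.1), so the arithmetic above yields the two-digit form used throughout this
 * file (101 for 10.1). A standalone sketch of that conversion, assuming only the
 * standard library: */

#include <cassert>

/* Hypothetical stand-in for the CUDA_VERSION macro: major * 1000 + minor * 10. */
static const int kCudaVersion = 10010; /* CUDA 10.1 */

int compiler_version_digits()
{
  /* Same arithmetic as cuewCompilerVersion() above: 10010 -> 100 + 1 -> 101. */
  return (kCudaVersion / 100) + (kCudaVersion % 100 / 10);
}

int main()
{
  assert(compiler_version_digits() == 101);
  return 0;
}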
explicit CUDASplitKernel(CUDADevice *device);
  int num_global_elements,

class CUDAContextScope {
  CUDAContextScope(CUDADevice *device);
bool CUDADevice::have_precompiled_kernels()
  string cubins_path = path_get("lib");

bool CUDADevice::show_samples() const

void CUDADevice::set_error(const string &error)
  fprintf(stderr,
          "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
  fprintf(stderr,
          "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
# define cuda_assert(stmt) \
    CUresult result = stmt; \
    if (result != CUDA_SUCCESS) { \
      const char *name = cuewErrorString(result); \
      set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
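/* Note: the macro folds the failing call's text (#stmt) and line number into one
 * message and keeps going rather than aborting. A standalone sketch of the same
 * report-and-continue pattern in plain C++, with a hypothetical report_error() in
 * place of CUDADevice::set_error(): */

#include <cstdio>

/* Hypothetical stand-in for CUDADevice::set_error(); real code routes the message
 * through the Device so the UI can display it. */
static void report_error(const char *msg)
{
  fprintf(stderr, "%s\n", msg);
}

/* Same shape as cuda_assert() above: stringify the statement, record where it
 * failed, continue execution. */
#define check_assert(stmt) \
  { \
    int result = (stmt); \
    if (result != 0) { \
      char buf[256]; \
      snprintf(buf, sizeof(buf), "error %d in %s (line %d)", result, #stmt, __LINE__); \
      report_error(buf); \
    } \
  } \
  (void)0

static int fails() { return 7; }

int main()
{
  check_assert(fails()); /* prints: error 7 in fails() (line ...) */
  return 0;
}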
161 :
Device(info, stats, profiler, background_), texture_info(this,
"__texture_info",
MEM_GLOBAL)
164 background = background_;
175 need_texture_info =
false;
177 device_texture_headroom = 0;
178 device_working_headroom = 0;
179 move_texture_to_host =
false;
185 functions.loaded =
false;
188 CUresult
result = cuInit(0);
189 if (
result != CUDA_SUCCESS) {
190 set_error(
string_printf(
"Failed to initialize CUDA runtime (%s)", cuewErrorString(
result)));
195 result = cuDeviceGet(&cuDevice, cuDevId);
196 if (
result != CUDA_SUCCESS) {
197 set_error(
string_printf(
"Failed to get CUDA device handle from ordinal (%s)",
198 cuewErrorString(
result)));
206 cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
208 cuda_assert(cuDeviceGetAttribute(
209 &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
211 unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
213 ctx_flags |= CU_CTX_MAP_HOST;
219 result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
222 result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
224 if (
result != CUDA_SUCCESS) {
225 result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
230 if (
result != CUDA_SUCCESS) {
236 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
237 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
238 cuDevArchitecture = major * 100 + minor * 10;
241 cuCtxPopCurrent(
NULL);
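/* Note: cuDevArchitecture packs the compute capability into one comparable integer
 * (6.1 becomes 610), which is what the "compute capability 3.0 or up" checks below
 * compare against 300. A minimal sketch of the encoding, independent of the driver
 * API: */

#include <cassert>

/* Same packing as cuDevArchitecture above: major * 100 + minor * 10. */
static int pack_architecture(int major, int minor)
{
  return major * 100 + minor * 10;
}

int main()
{
  assert(pack_architecture(6, 1) == 610);
  /* Ordering works as expected: anything below capability 3.0 fails the check. */
  assert(pack_architecture(2, 0) < 300);
  return 0;
}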
CUDADevice::~CUDADevice()

  cuda_assert(cuCtxDestroy(cuContext));

  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
  set_error(string_printf(
      "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
bool CUDADevice::check_peer_access(Device *peer_device)

  if (peer_device == this) {

  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);

  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
  if (can_access == 0) {

  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
                                      CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
                                      cuDevice,
                                      peer_device_cuda->cuDevice));
  if (can_access == 0) {

  const CUDAContextScope scope(this);
  CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                            cuewErrorString(result)));

  const CUDAContextScope scope(peer_device_cuda);
  CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
  if (result != CUDA_SUCCESS) {
    set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
                            cuewErrorString(result)));
bool CUDADevice::use_adaptive_compilation()

bool CUDADevice::use_split_kernel()

string CUDADevice::compile_kernel_get_common_cflags(

  const string source_path = path_get("source");
  const string include_path = source_path;

      "--ptxas-options=\"-v\" "

      include_path.c_str());
  if (!filter && use_adaptive_compilation()) {

  const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
    cflags += string(" ") + string(extra_cflags);

# ifdef WITH_CYCLES_DEBUG
  cflags += " -D__KERNEL_DEBUG__";

    cflags += " -D__SPLIT__";

  cflags += " -DWITH_NANOVDB";
  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);

  if (!use_adaptive_compilation()) {

    VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
      VLOG(1) << "Using precompiled kernel.";

    int ptx_major = major, ptx_minor = minor;
    while (ptx_major >= 3) {
          string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
      VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
        VLOG(1) << "Using precompiled kernel.";
  string source_path = path_get("source");

  string common_cflags = compile_kernel_get_common_cflags(
      requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);

  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
  const char *const kernel_arch = force_ptx ? "compute" : "sm";
      "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);

  VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
    VLOG(1) << "Using locally compiled kernel.";

  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
      set_error(
          string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
                        "Your GPU is not supported.",
      set_error(
          string_printf("CUDA binary kernel for this graphics card compute "
                        "capability (%d.%d) not found.",

  const char *const nvcc = cuewCompilerPath();
    set_error(
        "CUDA nvcc compiler not found. "
        "Install CUDA toolkit in default location.");

  const int nvcc_cuda_version = cuewCompilerVersion();
  VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
  if (nvcc_cuda_version < 101) {
    printf(
        "Unsupported CUDA version %d.%d detected, "
        "you need CUDA 10.1 or newer.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);
  else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
             nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
    printf(
        "CUDA version %d.%d detected, build may succeed but only "
        "CUDA 10.1 to 11.4 are officially supported.\n",
        nvcc_cuda_version / 10,
        nvcc_cuda_version % 10);

      common_cflags.c_str());

  printf("Compiling CUDA kernel ...\n%s\n", command.c_str());

  command = "call " + command;

  if (system(command.c_str()) != 0) {
    set_error(
        "Failed to execute compilation command, "
        "see console for details.");

    set_error(
        "CUDA kernel compilation failed, "
        "see console for details.");

  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
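/* Note: cuewCompilerVersion() returns major * 10 + minor (CUDA 10.1 -> 101), which
 * is why the checks above compare against 101 and split the value with / 10 and
 * % 10. A quick standalone illustration, using an assumed value: */

#include <cstdio>

int main()
{
  /* Hypothetical value in the encoding used above: CUDA 11.4 -> 114. */
  const int nvcc_cuda_version = 114;
  /* Recover major/minor the same way the warning messages above do. */
  printf("CUDA %d.%d\n", nvcc_cuda_version / 10, nvcc_cuda_version % 10); /* CUDA 11.4 */
  return 0;
}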
  if (cuFilterModule && cuModule) {
    VLOG(1) << "Skipping kernel reload, not currently supported.";

  if (!support_device(requested_features))

  const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
  string cubin = compile_kernel(requested_features, kernel_name);

  const char *filter_name = "filter";
  string filter_cubin = compile_kernel(requested_features, filter_name);
  if (filter_cubin.empty())

  CUDAContextScope scope(this);

    result = cuModuleLoadData(&cuModule, cubin_data.c_str());
    result = CUDA_ERROR_FILE_NOT_FOUND;

  if (result != CUDA_SUCCESS)
    set_error(string_printf(
        "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));

    result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
    result = CUDA_ERROR_FILE_NOT_FOUND;

  if (result != CUDA_SUCCESS)
    set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
                            filter_cubin.c_str(),
                            cuewErrorString(result)));

  if (result == CUDA_SUCCESS) {
    reserve_local_memory(requested_features);

  return (result == CUDA_SUCCESS);
void CUDADevice::load_functions()

  if (functions.loaded) {

  functions.loaded = true;

  cuda_assert(cuModuleGetFunction(
      &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
  cuda_assert(cuModuleGetFunction(
      &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
  cuda_assert(cuModuleGetFunction(
      &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
  cuda_assert(cuModuleGetFunction(
      &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));

  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));

  int unused_min_blocks;
  cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
                                               &functions.adaptive_num_threads_per_block,
                                               functions.adaptive_scale_samples,
  if (use_split_kernel()) {

  CUDAContextScope scope(this);

  size_t total = 0, free_before = 0, free_after = 0;
  cuMemGetInfo(&free_before, &total);

    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));

  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));

  int min_blocks, num_threads_per_block;
  cuda_assert(
      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));

  CUdeviceptr d_work_tiles = 0;
  uint total_work_size = 0;
  void *args[] = {&d_work_tiles, &total_work_size};

  cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));

  cuda_assert(cuCtxSynchronize());

  cuMemGetInfo(&free_after, &total);

  const size_t keep_mb = 1024;
  while (free_after > keep_mb * 1024 * 1024LL) {
    cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
    cuMemGetInfo(&free_after, &total);
void CUDADevice::init_host_memory()

  size_t default_limit = 4 * 1024 * 1024 * 1024LL;

  if (system_ram > 0) {
    if (system_ram / 2 > default_limit) {
      map_host_limit = system_ram - default_limit;
      map_host_limit = system_ram / 2;

    VLOG(1) << "Mapped host memory disabled, failed to get system RAM";

  device_working_headroom = 32 * 1024 * 1024LL;
  device_texture_headroom = 128 * 1024 * 1024LL;
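/* Note: the policy above leaves the system at least 4 GiB of RAM (or half of RAM on
 * small machines) and keeps fixed working/texture headrooms on the device. A sketch
 * of the same arithmetic with assumed example sizes (64-bit size_t assumed): */

#include <cassert>
#include <cstddef>

int main()
{
  const size_t GiB = 1024 * 1024 * 1024ULL;
  const size_t default_limit = 4 * GiB;

  /* Hypothetical 16 GiB machine: half of RAM exceeds the 4 GiB reserve, so mapped
   * host memory may use RAM minus the reserve (12 GiB). */
  size_t system_ram = 16 * GiB;
  size_t map_host_limit = (system_ram / 2 > default_limit) ? system_ram - default_limit :
                                                             system_ram / 2;
  assert(map_host_limit == 12 * GiB);

  /* Hypothetical 6 GiB machine: fall back to half of RAM (3 GiB). */
  system_ram = 6 * GiB;
  map_host_limit = (system_ram / 2 > default_limit) ? system_ram - default_limit :
                                                      system_ram / 2;
  assert(map_host_limit == 3 * GiB);
  return 0;
}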
void CUDADevice::load_texture_info()

  if (need_texture_info) {

    need_texture_info = false;
    texture_info.copy_to_device();

void CUDADevice::move_textures_to_host(size_t size, bool for_texture)

  static bool any_device_moving_textures_to_host = false;
  if (any_device_moving_textures_to_host) {

  move_texture_to_host = true;

    bool max_is_image = false;

    foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
      CUDAMem *cmem = &pair.second;

      if (!mem.is_resident(this) || cmem->use_mapped_host) {

      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
                        (&mem != &texture_info);
      bool is_image = is_texture && (mem.data_height > 1);

      if (!is_texture || cmem->array) {

      if (for_texture && !is_image) {

      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
        max_is_image = is_image;

      VLOG(1) << "Move memory from device to host: " << max_mem->name;

      any_device_moving_textures_to_host = true;

      any_device_moving_textures_to_host = false;

  move_texture_to_host = false;
CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)

  CUDAContextScope scope(this);

  CUdeviceptr device_pointer = 0;

  CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
  const char *status = "";

  bool is_image = is_texture && (mem.data_height > 1);

  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;

  size_t total = 0, free = 0;
  cuMemGetInfo(&free, &total);

  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
    move_textures_to_host(size + headroom - free, is_texture);
    cuMemGetInfo(&free, &total);

  if (!move_texture_to_host && (size + headroom) < free) {
    mem_alloc_result = cuMemAlloc(&device_pointer, size);
    if (mem_alloc_result == CUDA_SUCCESS) {
      status = " in device memory";

  void *shared_pointer = 0;

      mem_alloc_result = CUDA_SUCCESS;
    else if (map_host_used + size < map_host_limit) {
      mem_alloc_result = cuMemHostAlloc(
          &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);

      assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
             (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));

    if (mem_alloc_result == CUDA_SUCCESS) {
      cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
      map_host_used += size;
      status = " in host memory";

  if (mem_alloc_result != CUDA_SUCCESS) {
      status = " failed, out of device memory";
      set_error("System is out of GPU memory");
      status = " failed, out of device and host memory";
      set_error("System is out of GPU and shared host memory");

  VLOG(1) << "Buffer allocate: " << mem.name << ", "

  CUDAMem *cmem = &cuda_mem_map[&mem];
  if (shared_pointer != 0) {

    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&

      cmem->use_mapped_host = true;
      cmem->use_mapped_host = false;
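/* Note: generic_alloc() tries device memory first, then falls back to page-locked
 * host memory mapped into the device address space, and only then reports an
 * out-of-memory error. A reduced driver-API sketch of that ladder (illustrative
 * helper, not Cycles API; assumes a context created with CU_CTX_MAP_HOST and omits
 * the headroom/eviction logic): */

#include <cuda.h>

CUresult alloc_with_host_fallback(size_t size, CUdeviceptr *device_pointer, void **shared_pointer)
{
  *shared_pointer = 0;

  /* 1) Preferred: ordinary device memory. */
  if (cuMemAlloc(device_pointer, size) == CUDA_SUCCESS) {
    return CUDA_SUCCESS;
  }

  /* 2) Fallback: pinned host memory the GPU can address directly. Write-combined
   * pages keep host writes fast; CPU-side reads become slow. */
  CUresult result = cuMemHostAlloc(
      shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
  if (result != CUDA_SUCCESS) {
    return result; /* Out of both device and shared host memory. */
  }
  return cuMemHostGetDevicePointer_v2(device_pointer, *shared_pointer, 0);
}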
  const CUDAContextScope scope(this);

  CUDAContextScope scope(this);

  const CUDAMem &cmem = cuda_mem_map[&mem];

  if (cmem.use_mapped_host) {

  cuda_mem_map.erase(cuda_mem_map.find(&mem));

    assert(!"mem_alloc not supported for textures.");
    assert(!"mem_alloc not supported for global memory.");

    assert(!"mem_copy_to not supported for pixels.");

    generic_copy_to(mem);
void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)

    pixels_copy_from(mem, y, w, h);

    assert(!"mem_copy_from not supported for textures.");

    const size_t size = elem * w * h;
    const size_t offset = elem * y * w;

      const CUDAContextScope scope(this);
      cuda_assert(cuMemcpyDtoH(

  const CUDAContextScope scope(this);
void CUDADevice::const_copy_to(const char *name, void *host, size_t size)

  CUDAContextScope scope(this);

  cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
  cuda_assert(cuMemcpyHtoD(mem, host, size));

    generic_copy_to(mem);
  CUDAContextScope scope(this);

  string bind_name = mem.name;

  CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
      address_mode = CU_TR_ADDRESS_MODE_WRAP;
      address_mode = CU_TR_ADDRESS_MODE_CLAMP;
      address_mode = CU_TR_ADDRESS_MODE_BORDER;

  CUfilter_mode filter_mode;
    filter_mode = CU_TR_FILTER_MODE_POINT;
    filter_mode = CU_TR_FILTER_MODE_LINEAR;

  CUarray_format_enum format;
      format = CU_AD_FORMAT_UNSIGNED_INT8;
      format = CU_AD_FORMAT_UNSIGNED_INT16;
      format = CU_AD_FORMAT_UNSIGNED_INT32;
      format = CU_AD_FORMAT_SIGNED_INT32;
      format = CU_AD_FORMAT_FLOAT;
      format = CU_AD_FORMAT_HALF;

  CUDAMem *cmem = NULL;
  CUarray array_3d = NULL;
  size_t dst_pitch = src_pitch;

    cmem = &cuda_mem_map[&mem];
    cmem->texobject = 0;
    cmem->array = array_3d;

    dst_pitch = align_up(src_pitch, pitch_alignment);

    CUDA_ARRAY3D_DESCRIPTOR desc;

    VLOG(1) << "Array 3D allocate: " << mem.name << ", "

    cuda_assert(cuArray3DCreate(&array_3d, &desc));

    CUDA_MEMCPY3D param;
    memset(&param, 0, sizeof(param));
    param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    param.dstArray = array_3d;
    param.srcMemoryType = CU_MEMORYTYPE_HOST;
    param.srcPitch = src_pitch;
    param.WidthInBytes = param.srcPitch;

    cuda_assert(cuMemcpy3D(&param));

    cmem = &cuda_mem_map[&mem];
    cmem->texobject = 0;
    cmem->array = array_3d;

    dst_pitch = align_up(src_pitch, pitch_alignment);

    cmem = generic_alloc(mem, dst_size - mem.memory_size());

    CUDA_MEMCPY2D param;
    memset(&param, 0, sizeof(param));
    param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    param.dstPitch = dst_pitch;
    param.srcMemoryType = CU_MEMORYTYPE_HOST;
    param.srcPitch = src_pitch;
    param.WidthInBytes = param.srcPitch;

    cuda_assert(cuMemcpy2DUnaligned(&param));

    cmem = generic_alloc(mem);

  if (slot >= texture_info.size()) {
    texture_info.resize(slot + 128);

  texture_info[slot] = mem.info;
  need_texture_info = true;

    CUDA_RESOURCE_DESC resDesc;
    memset(&resDesc, 0, sizeof(resDesc));

      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
      resDesc.res.array.hArray = array_3d;

      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
      resDesc.res.pitch2D.format = format;
      resDesc.res.pitch2D.pitchInBytes = dst_pitch;

      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
      resDesc.res.linear.format = format;

    CUDA_TEXTURE_DESC texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.addressMode[0] = address_mode;
    texDesc.addressMode[1] = address_mode;
    texDesc.addressMode[2] = address_mode;
    texDesc.filterMode = filter_mode;
    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;

    cmem = &cuda_mem_map[&mem];

    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));

    texture_info[slot].data = (uint64_t)cmem->texobject;
  CUDAContextScope scope(this);

  const CUDAMem &cmem = cuda_mem_map[&mem];

  if (cmem.texobject) {
    cuTexObjectDestroy(cmem.texobject);

    cuda_mem_map.erase(cuda_mem_map.find(&mem));
  else if (cmem.array) {
    cuArrayDestroy(cmem.array);

    cuda_mem_map.erase(cuda_mem_map.find(&mem));
# define CUDA_GET_BLOCKSIZE(func, w, h) \
    int threads_per_block; \
    cuda_assert( \
        cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
    int threads = (int)sqrt((float)threads_per_block); \
    int xblocks = ((w) + threads - 1) / threads; \
    int yblocks = ((h) + threads - 1) / threads;

# define CUDA_LAUNCH_KERNEL(func, args) \
    cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));

# define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
    int threads_per_block; \
    cuda_assert( \
        cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
    int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
    int yblocks = h;

# define CUDA_LAUNCH_KERNEL_1D(func, args) \
    cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
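/* Note: both macros derive the launch grid from the kernel's maximum block size:
 * the 2D variant takes a square block of sqrt(max_threads) per side, the 1D variant
 * uses the full block width, and both round the element count up to whole blocks.
 * The rounding in isolation, with an assumed attribute value: */

#include <cassert>
#include <cmath>

int main()
{
  /* Assumed query result; CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK is often 1024. */
  const int threads_per_block = 1024;

  /* 2D case: 32x32 threads, grid rounded up to cover a 1920x1080 image. */
  const int threads = (int)sqrt((float)threads_per_block);
  assert(threads == 32);
  const int xblocks = (1920 + threads - 1) / threads; /* 60 blocks */
  const int yblocks = (1080 + threads - 1) / threads; /* 34 blocks, last one partial */
  assert(xblocks == 60 && yblocks == 34);

  /* 1D case: one block row per shift, as in the NLM launches below. */
  const int w = 1000;
  assert((w + threads_per_block - 1) / threads_per_block == 1);
  return 0;
}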
bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,

  CUDAContextScope scope(this);

  int w = task->buffer.width;
  int h = task->buffer.h;
  int r = task->nlm_state.r;
  int f = task->nlm_state.f;
  float a = task->nlm_state.a;
  float k_2 = task->nlm_state.k_2;

  int pass_stride = task->buffer.pass_stride;
  int num_shifts = (2 * r + 1) * (2 * r + 1);
  int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
  int frame_offset = 0;

  CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
  CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
  CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
  CUdeviceptr scale_ptr = 0;

  cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
  cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));

  CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
  cuda_assert(cuModuleGetFunction(
      &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
  cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
  cuda_assert(cuModuleGetFunction(
      &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
  cuda_assert(cuModuleGetFunction(
      &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));

  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
  cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
  cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));

  CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);

  void *calc_difference_args[] = {&guide_ptr,
  void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
  void *calc_weight_args[] = {
      &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
  void *update_output_args[] = {&blurDifference,

  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
  CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);

  CUfunction cuNLMNormalize;
  cuda_assert(
      cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
  cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
  void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
  CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
  CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
  cuda_assert(cuCtxSynchronize());

  return !have_error();
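/* Note: the three NLM scratch buffers above are carved out of one temporary
 * allocation by byte offset: difference at 0, blurDifference one float-per-pixel-
 * per-shift plane later, weightAccum two planes in. The layout math in isolation,
 * with assumed example sizes: */

#include <cassert>
#include <cstddef>

int main()
{
  /* Assumed example: a 7x7 shift window (r = 3) over a pass of 64k pixels. */
  const int r = 3;
  const int num_shifts = (2 * r + 1) * (2 * r + 1); /* 49 */
  const size_t pass_stride = 64 * 1024;

  /* One "plane" holds a float per pixel per shift. */
  const size_t plane = sizeof(float) * pass_stride * num_shifts;

  /* Same carving as above: three sub-buffers in one allocation. */
  const size_t difference = 0;
  const size_t blurDifference = difference + plane;
  const size_t weightAccum = difference + 2 * plane;

  assert(blurDifference == plane);
  assert(weightAccum == 2 * plane);
  return 0;
}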
  CUDAContextScope scope(this);

  CUfunction cuFilterConstructTransform;
  cuda_assert(cuModuleGetFunction(
      &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
  cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
  CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);

  void *args[] = {&task->buffer.mem.device_pointer,
                  &task->tile_info_mem.device_pointer,
                  &task->storage.transform.device_pointer,
                  &task->storage.rank.device_pointer,
                  &task->pca_threshold,
                  &task->buffer.pass_stride,
                  &task->buffer.frame_stride,
                  &task->buffer.use_time};
  CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
  cuda_assert(cuCtxSynchronize());

  return !have_error();
bool CUDADevice::denoising_accumulate(device_ptr color_ptr,

  CUDAContextScope scope(this);

  int r = task->radius;
  float k_2 = task->nlm_k_2;

  int w = task->reconstruction_state.source_w;
  int h = task->reconstruction_state.source_h;

  int frame_offset = frame * task->buffer.frame_stride;
  int t = task->tile_info->frames[frame];

  int pass_stride = task->buffer.pass_stride;
  int num_shifts = (2 * r + 1) * (2 * r + 1);

  CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
  CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;

  CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
  cuda_assert(cuModuleGetFunction(
      &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
  cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
  cuda_assert(
      cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
  cuda_assert(cuModuleGetFunction(
      &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));

  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
  cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
  cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));

  CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
                        task->reconstruction_state.source_w * task->reconstruction_state.source_h,

  void *calc_difference_args[] = {&color_ptr,
                                  &color_variance_ptr,
  void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
  void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
  void *construct_gramian_args[] = {&t,
                                    &task->buffer.mem.device_pointer,
                                    &task->storage.transform.device_pointer,
                                    &task->storage.rank.device_pointer,
                                    &task->storage.XtWX.device_pointer,
                                    &task->storage.XtWY.device_pointer,
                                    &task->reconstruction_state.filter_window,
                                    &task->buffer.use_time};

  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
  CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
  cuda_assert(cuCtxSynchronize());

  return !have_error();
  CUfunction cuFinalize;
  cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
  cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
  void *finalize_args[] = {&output_ptr,
                           &task->storage.rank.device_pointer,
                           &task->storage.XtWX.device_pointer,
                           &task->storage.XtWY.device_pointer,
                           &task->reconstruction_state.buffer_params.x,
                           &task->render_buffer.samples};
  CUDA_GET_BLOCKSIZE(
      cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
  CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
  cuda_assert(cuCtxSynchronize());

  return !have_error();
bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,

  CUDAContextScope scope(this);

  CUfunction cuFilterCombineHalves;
  cuda_assert(cuModuleGetFunction(
      &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
  cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
  CUDA_GET_BLOCKSIZE(
      cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

  void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
  CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
  cuda_assert(cuCtxSynchronize());

  return !have_error();
bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,

  CUDAContextScope scope(this);

  CUfunction cuFilterDivideShadow;
  cuda_assert(cuModuleGetFunction(
      &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
  cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
  CUDA_GET_BLOCKSIZE(
      cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

  void *args[] = {&task->render_buffer.samples,
                  &task->tile_info_mem.device_pointer,
                  &sample_variance_ptr,
                  &buffer_variance_ptr,
                  &task->render_buffer.pass_stride,
                  &task->render_buffer.offset};
  CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
  cuda_assert(cuCtxSynchronize());

  return !have_error();
bool CUDADevice::denoising_get_feature(int mean_offset,
                                       int variance_offset,

  CUDAContextScope scope(this);

  CUfunction cuFilterGetFeature;
  cuda_assert(
      cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
  cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
  CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

  void *args[] = {&task->render_buffer.samples,
                  &task->tile_info_mem.device_pointer,
                  &task->render_buffer.pass_stride,
                  &task->render_buffer.offset};
  CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
  cuda_assert(cuCtxSynchronize());

  return !have_error();
bool CUDADevice::denoising_write_feature(int out_offset,

  CUDAContextScope scope(this);

  CUfunction cuFilterWriteFeature;
  cuda_assert(cuModuleGetFunction(
      &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
  cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
  CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);

  void *args[] = {&task->render_buffer.samples,
                  &task->reconstruction_state.buffer_params,
  CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
  cuda_assert(cuCtxSynchronize());

  return !have_error();
bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,

  CUDAContextScope scope(this);

  CUfunction cuFilterDetectOutliers;
  cuda_assert(cuModuleGetFunction(
      &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
  cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
  CUDA_GET_BLOCKSIZE(
      cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);

  void *args[] = {
      &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
  CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
  cuda_assert(cuCtxSynchronize());

  return !have_error();
  denoising.functions.construct_transform = function_bind(
      &CUDADevice::denoising_construct_transform, this, &denoising);
  denoising.functions.accumulate = function_bind(
      &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
  denoising.functions.divide_shadow = function_bind(
      &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
  denoising.functions.non_local_means = function_bind(
      &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
  denoising.functions.combine_halves = function_bind(
      &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
  denoising.functions.get_feature = function_bind(
      &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
  denoising.functions.write_feature = function_bind(
      &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
  denoising.functions.detect_outliers = function_bind(
      &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
void CUDADevice::adaptive_sampling_filter(uint filter_sample,
                                          CUdeviceptr d_wtile,

  const int num_threads_per_block = functions.adaptive_num_threads_per_block;

  uint total_work_size = wtile->h * wtile->w;
  void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
  uint num_blocks = divide_up(total_work_size, num_threads_per_block);
  cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
                             num_threads_per_block,

  total_work_size = wtile->h;
  num_blocks = divide_up(total_work_size, num_threads_per_block);
  cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
                             num_threads_per_block,

  total_work_size = wtile->w;
  num_blocks = divide_up(total_work_size, num_threads_per_block);
  cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
                             num_threads_per_block,

void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
                                        CUdeviceptr d_wtile,

  const int num_threads_per_block = functions.adaptive_num_threads_per_block;
  uint total_work_size = wtile->h * wtile->w;

  uint num_blocks = divide_up(total_work_size, num_threads_per_block);
  cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
                             num_threads_per_block,
  CUDAContextScope scope(this);
  CUfunction cuRender;

    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
  else if (task.integrator_branched) {
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
    cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));

  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));

  work_tiles.alloc(1);

  int min_blocks, num_threads_per_block;
  cuda_assert(
      cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));

  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);

  for (int sample = start_sample; sample < end_sample;) {
    if (task.adaptive_sampling.use) {

    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;

    uint num_blocks = divide_up(total_work_size, num_threads_per_block);

    void *args[] = {&d_work_tiles, &total_work_size};

    cuda_assert(
        cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));

    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
      adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);

    cuda_assert(cuCtxSynchronize());

    if (task.get_cancel()) {
      if (task.need_finish_queue == false)

  if (task.adaptive_sampling.use) {
    CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
    adaptive_sampling_post(rtile, wtile, d_work_tiles);
    cuda_assert(cuCtxSynchronize());
  CUDAContextScope scope(this);

  CUfunction cuFilmConvert;
  CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
  CUdeviceptr d_buffer = (CUdeviceptr)buffer;

    cuda_assert(
        cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
    cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));

  float sample_scale = 1.0f / (task.sample + 1);

  void *args[] = {&d_rgba,

  int threads_per_block;
  cuda_assert(cuFuncGetAttribute(
      &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));

  int xthreads = (int)sqrt(threads_per_block);
  int ythreads = (int)sqrt(threads_per_block);
  int xblocks = (task.w + xthreads - 1) / xthreads;
  int yblocks = (task.h + ythreads - 1) / ythreads;

  cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));

  cuda_assert(cuLaunchKernel(cuFilmConvert,

  unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);

  cuda_assert(cuCtxSynchronize());
  CUDAContextScope scope(this);

  CUfunction cuShader;
  CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
  CUdeviceptr d_output = (CUdeviceptr)task.shader_output;

    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
    cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));

  const int shader_chunk_size = 65536;
  const int start = task.shader_x;
  const int end = task.shader_x + task.shader_w;
  int offset = task.offset;

  bool canceled = false;
  for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
    int shader_w = min(shader_chunk_size, end - shader_x);

    args[arg++] = &d_input;
    args[arg++] = &d_output;
    args[arg++] = &task.shader_eval_type;
      args[arg++] = &task.shader_filter;
    args[arg++] = &shader_x;
    args[arg++] = &shader_w;
    args[arg++] = &offset;

    int threads_per_block;
    cuda_assert(cuFuncGetAttribute(
        &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));

    int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;

    cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
    cuda_assert(cuLaunchKernel(cuShader,

    cuda_assert(cuCtxSynchronize());

    if (task.get_cancel()) {
CUdeviceptr CUDADevice::map_pixels(device_ptr mem)

    PixelMem pmem = pixel_mem_map[mem];

    cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
    cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));

  return (CUdeviceptr)mem;

void CUDADevice::unmap_pixels(device_ptr mem)

    PixelMem pmem = pixel_mem_map[mem];

    cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
  CUDAContextScope scope(this);

  glGenBuffers(1, &pmem.cuPBO);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
    glBufferData(
        GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
    glBufferData(
        GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);

  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

  glActiveTexture(GL_TEXTURE0);
  glGenTextures(1, &pmem.cuTexId);
  glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
  glBindTexture(GL_TEXTURE_2D, 0);

  CUresult result = cuGraphicsGLRegisterBuffer(
      &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

  if (result == CUDA_SUCCESS) {

  glDeleteBuffers(1, &pmem.cuPBO);
  glDeleteTextures(1, &pmem.cuTexId);
void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)

  CUDAContextScope scope(this);

  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
  uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
  size_t offset = sizeof(uchar) * 4 * y * w;
  glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

  CUDAContextScope scope(this);

  cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
  glDeleteBuffers(1, &pmem.cuPBO);
  glDeleteTextures(1, &pmem.cuTexId);
  CUDAContextScope scope(this);

  size_t offset = 4 * y * w;
  offset *= sizeof(GLhalf);

  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
  glActiveTexture(GL_TEXTURE0);
  glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

  glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);

  GLint shader_program;
  if (use_fallback_shader) {
    if (!bind_fallback_display_space_shader(dw, dh)) {
    shader_program = fallback_shader_program;

    glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);

  if (!vertex_buffer) {
    glGenBuffers(1, &vertex_buffer);

  glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);

  glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);

  vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);

    vpointer[4] = (float)w / (float)pmem.w;

    vpointer[8] = (float)w / (float)pmem.w;
    vpointer[9] = (float)h / (float)pmem.h;

    vpointer[12] = 0.0f;
    vpointer[13] = (float)h / (float)pmem.h;

    glUnmapBuffer(GL_ARRAY_BUFFER);

  GLuint vertex_array_object;
  GLuint position_attribute, texcoord_attribute;

  glGenVertexArrays(1, &vertex_array_object);
  glBindVertexArray(vertex_array_object);

  texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
  position_attribute = glGetAttribLocation(shader_program, "pos");

  glEnableVertexAttribArray(texcoord_attribute);
  glEnableVertexAttribArray(position_attribute);

  glVertexAttribPointer(
      texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
  glVertexAttribPointer(position_attribute,
                        (const GLvoid *)(sizeof(float) * 2));

  glDrawArrays(GL_TRIANGLE_FAN, 0, 4);

  if (use_fallback_shader) {

  glBindTexture(GL_TEXTURE_2D, 0);

  Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
  CUDAContextScope scope(this);

    if (use_split_kernel()) {
      if (split_kernel == NULL) {
        split_kernel = new CUDASplitKernel(this);
        split_kernel->load_kernels(requested_features);

    while (task.acquire_tile(this, tile, task.tile_types)) {
        if (use_split_kernel()) {
          split_kernel->path_trace(task, tile, void_buffer, void_buffer);
          render(task, tile, work_tiles);

        render(task, tile, work_tiles);

        denoise(tile, denoising);

      task.update_progress(&tile, tile.w * tile.h);

      task.release_tile(tile);

      if (task.get_cancel()) {
        if (task.need_finish_queue == false)

    cuda_assert(cuCtxSynchronize());

      denoise(tile, denoising);
      task.update_progress(&tile, tile.w * tile.h);

    CUDAContextScope scope(this);

    load_texture_info();

    cuda_assert(cuCtxSynchronize());

  thread_run(task_copy);

void CUDADevice::task_wait()

void CUDADevice::task_cancel()
# define cuda_assert(stmt) \
    CUresult result = stmt; \
    if (result != CUDA_SUCCESS) { \
      const char *name = cuewErrorString(result); \
      device->set_error( \
          string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \

CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)

  cuda_assert(cuCtxPushCurrent(device->cuContext));

CUDAContextScope::~CUDAContextScope()

  cuda_assert(cuCtxPopCurrent(NULL));
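/* Note: CUDAContextScope is plain RAII over cuCtxPushCurrent()/cuCtxPopCurrent();
 * every CUDADevice method that touches the driver API opens one at the top of its
 * scope. A reduced sketch of the same pattern, driver API only, no Cycles types,
 * error handling omitted: */

#include <cuda.h>

class ContextScope {
 public:
  explicit ContextScope(CUcontext context)
  {
    cuCtxPushCurrent(context); /* Make the context current on this thread. */
  }
  ~ContextScope()
  {
    CUcontext ignored;
    cuCtxPopCurrent(&ignored); /* Restore whatever was current before. */
  }
  /* Non-copyable: a scope must pop exactly once. */
  ContextScope(const ContextScope &) = delete;
  ContextScope &operator=(const ContextScope &) = delete;
};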
  CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)

    if (device->have_error())

    CUDAContextScope scope(device);

    int threads_per_block;
    cuda_assert(
        cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));

    cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));

    cuda_assert(cuLaunchKernel(func,

    return !device->have_error();
CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)

  CUDAContextScope scope(device);

  size_buffer.alloc(1);
  size_buffer.zero_to_device();

  CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;

  args_t args = {&threads, &d_size};

  CUfunction state_buffer_size;
  cuda_assert(cuModuleGetFunction(
      &state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));

  cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));

  size_buffer.copy_from_device(0, 1, 1);
  size_t size = size_buffer[0];
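/* Note: state_buffer_size() computes the split-state size on the GPU and reads it
 * back through a one-element buffer: zero it, launch a single thread that writes
 * the answer, copy the element home. A bare driver-API sketch of that round trip
 * (the kernel is assumed to take (num_threads, uint64_t *out); errors unchecked): */

#include <cuda.h>
#include <cstdint>

uint64_t query_device_value(CUfunction func, unsigned int num_threads)
{
  CUdeviceptr d_size;
  uint64_t size = 0;
  cuMemAlloc(&d_size, sizeof(uint64_t));
  cuMemsetD8(d_size, 0, sizeof(uint64_t));

  void *args[] = {&num_threads, &d_size};
  /* A single thread is enough: the kernel just evaluates a formula. */
  cuLaunchKernel(func, 1, 1, 1, 1, 1, 1, 0, 0, args, 0);
  cuCtxSynchronize();

  cuMemcpyDtoH(&size, d_size, sizeof(uint64_t));
  cuMemFree(d_size);
  return size;
}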
bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
                                                     int num_global_elements,

  CUDAContextScope scope(device);

  CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
  CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
  CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
  CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
  CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;

  CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;

    CUdeviceptr *work_pool_wgs;

  args_t args = {&d_split_data,
                 &num_global_elements,

  cuda_assert(
      cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
  if (device->have_error()) {

  CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);

  return !device->have_error();
SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,

  const CUDAContextScope scope(device);

  const CUresult result = cuModuleGetFunction(
      &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
  if (result != CUDA_SUCCESS) {
    device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
                                    cuewErrorString(result)));

  return new CUDASplitKernelFunction(device, func);

int2 CUDASplitKernel::split_kernel_local_size()

  CUDAContextScope scope(device);

  cuda_assert(cuMemGetInfo(&free, &total));

  size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);

  VLOG(1) << "Global size: " << global_size << ".";