Blender V2.93
device_cuda_impl.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef WITH_CUDA
18 
19 # include <climits>
20 # include <limits.h>
21 # include <stdio.h>
22 # include <stdlib.h>
23 # include <string.h>
24 
25 # include "device/cuda/device_cuda.h"
26 # include "device/device_intern.h"
27 # include "device/device_split_kernel.h"
28 
29 # include "render/buffers.h"
30 
31 # include "kernel/filter/filter_defines.h"
32 
33 # include "util/util_debug.h"
34 # include "util/util_foreach.h"
35 # include "util/util_logging.h"
36 # include "util/util_map.h"
37 # include "util/util_md5.h"
38 # include "util/util_opengl.h"
39 # include "util/util_path.h"
40 # include "util/util_string.h"
41 # include "util/util_system.h"
42 # include "util/util_time.h"
43 # include "util/util_types.h"
44 # include "util/util_windows.h"
45 
46 # include "kernel/split/kernel_split_data_types.h"
47 
48 CCL_NAMESPACE_BEGIN
49 
50 # ifndef WITH_CUDA_DYNLOAD
51 
52 /* Transparently implement some functions, so the majority of the file does not
53  * need to worry about the difference between dynamically loaded and linked CUDA at all.
54  */
55 
56 namespace {
57 
58 const char *cuewErrorString(CUresult result)
59 {
60  /* We can only give the error code here without major code duplication; that
61  * should be enough, since dynamic loading is only disabled by folks
62  * who know what they're doing anyway.
63  *
64  * NOTE: Avoid calling this from several threads.
65  */
66  static string error;
67  error = string_printf("%d", result);
68  return error.c_str();
69 }
70 
71 const char *cuewCompilerPath()
72 {
73  return CYCLES_CUDA_NVCC_EXECUTABLE;
74 }
75 
76 int cuewCompilerVersion()
77 {
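  /* A worked example of the encoding below: CUDA_VERSION represents 11.4 as
   * 11040, so 11040 / 100 = 110 and 11040 % 100 / 10 = 4, giving 114 in
   * cuew's MAJOR * 10 + MINOR convention. */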
78  return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
79 }
80 
81 } /* namespace */
82 # endif /* WITH_CUDA_DYNLOAD */
83 
84 class CUDADevice;
85 
86 class CUDASplitKernel : public DeviceSplitKernel {
87  CUDADevice *device;
88 
89  public:
90  explicit CUDASplitKernel(CUDADevice *device);
91 
92  virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
93 
94  virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
95  RenderTile &rtile,
96  int num_global_elements,
97  device_memory &kernel_globals,
98  device_memory &kernel_data_,
99  device_memory &split_data,
100  device_memory &ray_state,
101  device_memory &queue_index,
102  device_memory &use_queues_flag,
103  device_memory &work_pool_wgs);
104 
105  virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
106  const DeviceRequestedFeatures &);
107  virtual int2 split_kernel_local_size();
108  virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task);
109 };
110 
111 /* Utility to push/pop CUDA context. */
112 class CUDAContextScope {
113  public:
114  CUDAContextScope(CUDADevice *device);
115  ~CUDAContextScope();
116 
117  private:
118  CUDADevice *device;
119 };
120 
121 bool CUDADevice::have_precompiled_kernels()
122 {
123  string cubins_path = path_get("lib");
124  return path_exists(cubins_path);
125 }
126 
127 bool CUDADevice::show_samples() const
128 {
129  /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
130  return true;
131 }
132 
133 BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
134 {
135  return BVH_LAYOUT_BVH2;
136 }
137 
138 void CUDADevice::set_error(const string &error)
139 {
140  Device::set_error(error);
141 
142  if (first_error) {
143  fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
144  fprintf(stderr,
145  "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
146  first_error = false;
147  }
148 }
149 
150 # define cuda_assert(stmt) \
151  { \
152  CUresult result = stmt; \
153  if (result != CUDA_SUCCESS) { \
154  const char *name = cuewErrorString(result); \
155  set_error(string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
156  } \
157  } \
158  (void)0
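/* Usage sketch: wrapping a driver call as cuda_assert(cuCtxSynchronize())
 * records a formatted message via set_error() on failure but does not abort,
 * so callers are expected to check have_error() afterwards. */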
159 
160 CUDADevice::CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
161  : Device(info, stats, profiler, background_), texture_info(this, "__texture_info", MEM_GLOBAL)
162 {
163  first_error = true;
164  background = background_;
165 
166  cuDevId = info.num;
167  cuDevice = 0;
168  cuContext = 0;
169 
170  cuModule = 0;
171  cuFilterModule = 0;
172 
173  split_kernel = NULL;
174 
175  need_texture_info = false;
176 
177  device_texture_headroom = 0;
178  device_working_headroom = 0;
179  move_texture_to_host = false;
180  map_host_limit = 0;
181  map_host_used = 0;
182  can_map_host = 0;
183  pitch_alignment = 0;
184 
185  functions.loaded = false;
186 
187  /* Initialize CUDA. */
188  CUresult result = cuInit(0);
189  if (result != CUDA_SUCCESS) {
190  set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
191  return;
192  }
193 
194  /* Setup device and context. */
195  result = cuDeviceGet(&cuDevice, cuDevId);
196  if (result != CUDA_SUCCESS) {
197  set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
198  cuewErrorString(result)));
199  return;
200  }
201 
202  /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
203  * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
204  * so we can predict which memory to map to host. */
205  cuda_assert(
206  cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
207 
208  cuda_assert(cuDeviceGetAttribute(
209  &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
210 
211  unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
212  if (can_map_host) {
213  ctx_flags |= CU_CTX_MAP_HOST;
214  init_host_memory();
215  }
216 
217  /* Create context. */
218  if (background) {
219  result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
220  }
221  else {
222  result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
223 
224  if (result != CUDA_SUCCESS) {
225  result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
226  background = true;
227  }
228  }
229 
230  if (result != CUDA_SUCCESS) {
231  set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
232  return;
233  }
234 
235  int major, minor;
236  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
237  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
238  cuDevArchitecture = major * 100 + minor * 10;
239 
240  /* Pop context set by cuCtxCreate. */
241  cuCtxPopCurrent(NULL);
242 }
243 
244 CUDADevice::~CUDADevice()
245 {
246  task_pool.cancel();
247 
248  delete split_kernel;
249 
250  texture_info.free();
251 
252  cuda_assert(cuCtxDestroy(cuContext));
253 }
254 
255 bool CUDADevice::support_device(const DeviceRequestedFeatures & /*requested_features*/)
256 {
257  int major, minor;
258  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
259  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
260 
261  /* We only support sm_30 and above. */
262  if (major < 3) {
263  set_error(string_printf(
264  "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
265  return false;
266  }
267 
268  return true;
269 }
270 
271 bool CUDADevice::check_peer_access(Device *peer_device)
272 {
273  if (peer_device == this) {
274  return false;
275  }
276  if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
277  return false;
278  }
279 
280  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
281 
282  int can_access = 0;
283  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
284  if (can_access == 0) {
285  return false;
286  }
287 
288  // Ensure array access over the link is possible as well (for 3D textures)
289  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
290  CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED,
291  cuDevice,
292  peer_device_cuda->cuDevice));
293  if (can_access == 0) {
294  return false;
295  }
296 
297  // Enable peer access in both directions
298  {
299  const CUDAContextScope scope(this);
300  CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
301  if (result != CUDA_SUCCESS) {
302  set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
303  cuewErrorString(result)));
304  return false;
305  }
306  }
307  {
308  const CUDAContextScope scope(peer_device_cuda);
309  CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
310  if (result != CUDA_SUCCESS) {
311  set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
312  cuewErrorString(result)));
313  return false;
314  }
315  }
316 
317  return true;
318 }
319 
320 bool CUDADevice::use_adaptive_compilation()
321 {
322  return DebugFlags().cuda.adaptive_compile;
323 }
324 
325 bool CUDADevice::use_split_kernel()
326 {
327  return DebugFlags().cuda.split_kernel;
328 }
329 
330 /* Common NVCC flags which stay the same regardless of shading model or
331  * kernel sources md5, and depend only on compiler or compilation settings.
332  */
333 string CUDADevice::compile_kernel_get_common_cflags(
334  const DeviceRequestedFeatures &requested_features, bool filter, bool split)
335 {
336  const int machine = system_cpu_bits();
337  const string source_path = path_get("source");
338  const string include_path = source_path;
339  string cflags = string_printf(
340  "-m%d "
341  "--ptxas-options=\"-v\" "
342  "--use_fast_math "
343  "-DNVCC "
344  "-I\"%s\"",
345  machine,
346  include_path.c_str());
347  if (!filter && use_adaptive_compilation()) {
348  cflags += " " + requested_features.get_build_options();
349  }
350  const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
351  if (extra_cflags) {
352  cflags += string(" ") + string(extra_cflags);
353  }
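  /* A hypothetical example: launching Blender with
   * CYCLES_CUDA_EXTRA_CFLAGS="-G -lineinfo" in the environment appends nvcc
   * debugging flags to every kernel build. */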
354 # ifdef WITH_CYCLES_DEBUG
355  cflags += " -D__KERNEL_DEBUG__";
356 # endif
357 
358  if (split) {
359  cflags += " -D__SPLIT__";
360  }
361 
362 # ifdef WITH_NANOVDB
363  cflags += " -DWITH_NANOVDB";
364 # endif
365 
366  return cflags;
367 }
368 
369 string CUDADevice::compile_kernel(const DeviceRequestedFeatures &requested_features,
370  const char *name,
371  const char *base,
372  bool force_ptx)
373 {
374  /* Compute kernel name. */
375  int major, minor;
376  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
377  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
378 
379  /* Attempt to use kernel provided with Blender. */
380  if (!use_adaptive_compilation()) {
381  if (!force_ptx) {
382  const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
383  VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
384  if (path_exists(cubin)) {
385  VLOG(1) << "Using precompiled kernel.";
386  return cubin;
387  }
388  }
389 
390  /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
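  /* For example, a compute 7.5 device tries compute_75, compute_74, ...,
   * compute_70, compute_69, and so on down to compute_30; the loop below
   * simply walks the minor digit 9..0 within each major version. */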
391  int ptx_major = major, ptx_minor = minor;
392  while (ptx_major >= 3) {
393  const string ptx = path_get(
394  string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
395  VLOG(1) << "Testing for pre-compiled kernel " << ptx << ".";
396  if (path_exists(ptx)) {
397  VLOG(1) << "Using precompiled kernel.";
398  return ptx;
399  }
400 
401  if (ptx_minor > 0) {
402  ptx_minor--;
403  }
404  else {
405  ptx_major--;
406  ptx_minor = 9;
407  }
408  }
409  }
410 
411  /* Try to use locally compiled kernel. */
412  string source_path = path_get("source");
413  const string source_md5 = path_files_md5_hash(source_path);
414 
415  /* We include cflags in the md5, so that changing the CUDA toolkit or other
416  * compiler command line arguments makes sure the cubin gets re-built.
417  */
418  string common_cflags = compile_kernel_get_common_cflags(
419  requested_features, strstr(name, "filter") != NULL, strstr(name, "split") != NULL);
420  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
421 
422  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
423  const char *const kernel_arch = force_ptx ? "compute" : "sm";
424  const string cubin_file = string_printf(
425  "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
426  const string cubin = path_cache_get(path_join("kernels", cubin_file));
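  /* e.g. "cycles_kernel_sm_75_<md5>.cubin", or with force_ptx
   * "cycles_kernel_compute_75_<md5>.ptx", looked up in the user cache
   * directory under "kernels". */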
427  VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
428  if (path_exists(cubin)) {
429  VLOG(1) << "Using locally compiled kernel.";
430  return cubin;
431  }
432 
433 # ifdef _WIN32
434  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
435  if (major < 3) {
436  set_error(
437  string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
438  "Your GPU is not supported.",
439  major,
440  minor));
441  }
442  else {
443  set_error(
444  string_printf("CUDA binary kernel for this graphics card compute "
445  "capability (%d.%d) not found.",
446  major,
447  minor));
448  }
449  return string();
450  }
451 # endif
452 
453  /* Compile. */
454  const char *const nvcc = cuewCompilerPath();
455  if (nvcc == NULL) {
456  set_error(
457  "CUDA nvcc compiler not found. "
458  "Install CUDA toolkit in default location.");
459  return string();
460  }
461 
462  const int nvcc_cuda_version = cuewCompilerVersion();
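  /* cuewCompilerVersion() packs MAJOR * 10 + MINOR, so 101 means CUDA 10.1. */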
463  VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
464  if (nvcc_cuda_version < 101) {
465  printf(
466  "Unsupported CUDA version %d.%d detected, "
467  "you need CUDA 10.1 or newer.\n",
468  nvcc_cuda_version / 10,
469  nvcc_cuda_version % 10);
470  return string();
471  }
472  else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
473  nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
474  printf(
475  "CUDA version %d.%d detected, build may succeed but only "
476  "CUDA 10.1 to 11.4 are officially supported.\n",
477  nvcc_cuda_version / 10,
478  nvcc_cuda_version % 10);
479  }
480 
481  double starttime = time_dt();
482 
483  path_create_directories(cubin);
484 
485  source_path = path_join(path_join(source_path, "kernel"),
486  path_join("kernels", path_join(base, string_printf("%s.cu", name))));
487 
488  string command = string_printf(
489  "\"%s\" "
490  "-arch=%s_%d%d "
491  "--%s \"%s\" "
492  "-o \"%s\" "
493  "%s",
494  nvcc,
495  kernel_arch,
496  major,
497  minor,
498  kernel_ext,
499  source_path.c_str(),
500  cubin.c_str(),
501  common_cflags.c_str());
502 
503  printf("Compiling CUDA kernel ...\n%s\n", command.c_str());
504 
505 # ifdef _WIN32
506  command = "call " + command;
507 # endif
508  if (system(command.c_str()) != 0) {
509  set_error(
510  "Failed to execute compilation command, "
511  "see console for details.");
512  return string();
513  }
514 
515  /* Verify that compilation succeeded. */
516  if (!path_exists(cubin)) {
517  set_error(
518  "CUDA kernel compilation failed, "
519  "see console for details.");
520  return string();
521  }
522 
523  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
524 
525  return cubin;
526 }
527 
528 bool CUDADevice::load_kernels(const DeviceRequestedFeatures &requested_features)
529 {
530  /* TODO(sergey): Support kernels re-load for CUDA devices.
531  *
532  * Currently re-loading kernel will invalidate memory pointers,
533  * causing problems in cuCtxSynchronize.
534  */
535  if (cuFilterModule && cuModule) {
536  VLOG(1) << "Skipping kernel reload, not currently supported.";
537  return true;
538  }
539 
540  /* check if cuda init succeeded */
541  if (cuContext == 0)
542  return false;
543 
544  /* check if GPU is supported */
545  if (!support_device(requested_features))
546  return false;
547 
548  /* get kernel */
549  const char *kernel_name = use_split_kernel() ? "kernel_split" : "kernel";
550  string cubin = compile_kernel(requested_features, kernel_name);
551  if (cubin.empty())
552  return false;
553 
554  const char *filter_name = "filter";
555  string filter_cubin = compile_kernel(requested_features, filter_name);
556  if (filter_cubin.empty())
557  return false;
558 
559  /* open module */
560  CUDAContextScope scope(this);
561 
562  string cubin_data;
563  CUresult result;
564 
565  if (path_read_text(cubin, cubin_data))
566  result = cuModuleLoadData(&cuModule, cubin_data.c_str());
567  else
568  result = CUDA_ERROR_FILE_NOT_FOUND;
569 
570  if (result != CUDA_SUCCESS)
571  set_error(string_printf(
572  "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
573 
574  if (path_read_text(filter_cubin, cubin_data))
575  result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
576  else
577  result = CUDA_ERROR_FILE_NOT_FOUND;
578 
579  if (result != CUDA_SUCCESS)
580  set_error(string_printf("Failed to load CUDA kernel from '%s' (%s)",
581  filter_cubin.c_str(),
582  cuewErrorString(result)));
583 
584  if (result == CUDA_SUCCESS) {
585  reserve_local_memory(requested_features);
586 
587  load_functions();
588  }
589 
590  return (result == CUDA_SUCCESS);
591 }
592 
593 void CUDADevice::load_functions()
594 {
595  /* TODO: load all functions here. */
596  if (functions.loaded) {
597  return;
598  }
599  functions.loaded = true;
600 
601  cuda_assert(cuModuleGetFunction(
602  &functions.adaptive_stopping, cuModule, "kernel_cuda_adaptive_stopping"));
603  cuda_assert(cuModuleGetFunction(
604  &functions.adaptive_filter_x, cuModule, "kernel_cuda_adaptive_filter_x"));
605  cuda_assert(cuModuleGetFunction(
606  &functions.adaptive_filter_y, cuModule, "kernel_cuda_adaptive_filter_y"));
607  cuda_assert(cuModuleGetFunction(
608  &functions.adaptive_scale_samples, cuModule, "kernel_cuda_adaptive_scale_samples"));
609 
610  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_stopping, CU_FUNC_CACHE_PREFER_L1));
611  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_x, CU_FUNC_CACHE_PREFER_L1));
612  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_filter_y, CU_FUNC_CACHE_PREFER_L1));
613  cuda_assert(cuFuncSetCacheConfig(functions.adaptive_scale_samples, CU_FUNC_CACHE_PREFER_L1));
614 
615  int unused_min_blocks;
616  cuda_assert(cuOccupancyMaxPotentialBlockSize(&unused_min_blocks,
617  &functions.adaptive_num_threads_per_block,
618  functions.adaptive_scale_samples,
619  NULL,
620  0,
621  0));
622 }
623 
624 void CUDADevice::reserve_local_memory(const DeviceRequestedFeatures &requested_features)
625 {
626  if (use_split_kernel()) {
627  /* Split kernel mostly uses global memory and adaptive compilation, so it is
628  * difficult to predict how much is needed currently. */
629  return;
630  }
631 
632  /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
633  * needed for kernel launches, so that we can reliably figure out when
634  * to allocate scene data in mapped host memory. */
635  CUDAContextScope scope(this);
636 
637  size_t total = 0, free_before = 0, free_after = 0;
638  cuMemGetInfo(&free_before, &total);
639 
640  /* Get kernel function. */
641  CUfunction cuRender;
642 
643  if (requested_features.use_baking) {
644  cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
645  }
646  else if (requested_features.use_integrator_branched) {
647  cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
648  }
649  else {
650  cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
651  }
652 
653  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
654 
655  int min_blocks, num_threads_per_block;
656  cuda_assert(
657  cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
658 
659  /* Launch the kernel; using just 1 block appears sufficient to reserve
660  * memory for all multiprocessors. It would still be good to do this in
661  * parallel for the multi-GPU case, to make it faster. */
662  CUdeviceptr d_work_tiles = 0;
663  uint total_work_size = 0;
664 
665  void *args[] = {&d_work_tiles, &total_work_size};
666 
667  cuda_assert(cuLaunchKernel(cuRender, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
668 
669  cuda_assert(cuCtxSynchronize());
670 
671  cuMemGetInfo(&free_after, &total);
672  VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
673  << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
674 
675 # if 0
676  /* For testing mapped host memory, fill up device memory. */
677  const size_t keep_mb = 1024;
678 
679  while (free_after > keep_mb * 1024 * 1024LL) {
680  CUdeviceptr tmp;
681  cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
682  cuMemGetInfo(&free_after, &total);
683  }
684 # endif
685 }
686 
687 void CUDADevice::init_host_memory()
688 {
689  /* Limit amount of host mapped memory, because allocating too much can
690  * cause system instability. Leave at least half or 4 GB of system
691  * memory free, whichever is smaller. */
692  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
693  size_t system_ram = system_physical_ram();
694 
695  if (system_ram > 0) {
696  if (system_ram / 2 > default_limit) {
697  map_host_limit = system_ram - default_limit;
698  }
699  else {
700  map_host_limit = system_ram / 2;
701  }
702  }
703  else {
704  VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
705  map_host_limit = 0;
706  }
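  /* Worked example: with 16 GiB of system RAM, half (8 GiB) exceeds the
   * 4 GiB default limit, so map_host_limit = 16 - 4 = 12 GiB; with 6 GiB of
   * RAM, half is below the limit, so map_host_limit = 3 GiB. */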
707 
708  /* Amount of device memory to keep free after texture memory
709  * and working memory allocations respectively. We set the working
710  * memory limit headroom lower so that some space is left after all
711  * texture memory allocations. */
712  device_working_headroom = 32 * 1024 * 1024LL; // 32MB
713  device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
714 
715  VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
716  << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
717 }
718 
719 void CUDADevice::load_texture_info()
720 {
721  if (need_texture_info) {
722  /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
723  * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
724  need_texture_info = false;
725  texture_info.copy_to_device();
726  }
727 }
728 
729 void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
730 {
731  /* Break out of recursive call, which can happen when moving memory on a multi device. */
732  static bool any_device_moving_textures_to_host = false;
733  if (any_device_moving_textures_to_host) {
734  return;
735  }
736 
737  /* Signal to reallocate textures in host memory only. */
738  move_texture_to_host = true;
739 
740  while (size > 0) {
741  /* Find suitable memory allocation to move. */
742  device_memory *max_mem = NULL;
743  size_t max_size = 0;
744  bool max_is_image = false;
745 
746  thread_scoped_lock lock(cuda_mem_map_mutex);
747  foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
748  device_memory &mem = *pair.first;
749  CUDAMem *cmem = &pair.second;
750 
751  /* We can only move textures allocated on this device (and not those from
752  * peer devices), and need to ignore memory that is already on the host. */
753  if (!mem.is_resident(this) || cmem->use_mapped_host) {
754  continue;
755  }
756 
757  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
758  (&mem != &texture_info);
759  bool is_image = is_texture && (mem.data_height > 1);
760 
761  /* Can't move this type of memory. */
762  if (!is_texture || cmem->array) {
763  continue;
764  }
765 
766  /* For other textures, only move image textures. */
767  if (for_texture && !is_image) {
768  continue;
769  }
770 
771  /* Try to move largest allocation, prefer moving images. */
772  if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
773  max_is_image = is_image;
774  max_size = mem.device_size;
775  max_mem = &mem;
776  }
777  }
778  lock.unlock();
779 
780  /* Move to host memory. This part is mutex protected since
781  * multiple CUDA devices could be moving the memory. The
782  * first one will do it, and the rest will adopt the pointer. */
783  if (max_mem) {
784  VLOG(1) << "Move memory from device to host: " << max_mem->name;
785 
786  static thread_mutex move_mutex;
787  thread_scoped_lock lock(move_mutex);
788 
789  any_device_moving_textures_to_host = true;
790 
791  /* Potentially need to call back into multi device, so pointer mapping
792  * and peer devices are updated. This is also necessary since the device
793  * pointer may just be a key here, so it cannot be accessed and freed directly.
794  * Unfortunately it does mean that memory is reallocated on all other
795  * devices as well, which is potentially dangerous when still in use (since
796  * a thread rendering on another device would only be caught in this mutex
797  * if it happens to do an allocation at the same time as well). */
798  max_mem->device_copy_to();
799  size = (max_size >= size) ? 0 : size - max_size;
800 
801  any_device_moving_textures_to_host = false;
802  }
803  else {
804  break;
805  }
806  }
807 
808  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
809  move_texture_to_host = false;
810 
811  /* Update texture info array with new pointers. */
812  load_texture_info();
813 }
814 
815 CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
816 {
817  CUDAContextScope scope(this);
818 
819  CUdeviceptr device_pointer = 0;
820  size_t size = mem.memory_size() + pitch_padding;
821 
822  CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
823  const char *status = "";
824 
825  /* First try allocating in device memory, respecting headroom. We make
826  * an exception for texture info. It is small and frequently accessed,
827  * so treat it as working memory.
828  *
829  * If there is not enough room for working memory, we will try to move
830  * textures to host memory, assuming the performance impact would have
831  * been worse for working memory. */
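  /* A rough sketch of the decision with the defaults from init_host_memory():
   * a 1 GiB working-memory allocation, with its 32 MiB headroom, stays in
   * device memory only if more than 1056 MiB are free; otherwise textures are
   * moved to host memory first to make room. */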
832  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
833  bool is_image = is_texture && (mem.data_height > 1);
834 
835  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
836 
837  size_t total = 0, free = 0;
838  cuMemGetInfo(&free, &total);
839 
840  /* Move textures to host memory if needed. */
841  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
842  move_textures_to_host(size + headroom - free, is_texture);
843  cuMemGetInfo(&free, &total);
844  }
845 
846  /* Allocate in device memory. */
847  if (!move_texture_to_host && (size + headroom) < free) {
848  mem_alloc_result = cuMemAlloc(&device_pointer, size);
849  if (mem_alloc_result == CUDA_SUCCESS) {
850  status = " in device memory";
851  }
852  }
853 
854  /* Fall back to mapped host memory if needed and possible. */
855 
856  void *shared_pointer = 0;
857 
858  if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
859  if (mem.shared_pointer) {
860  /* Another device already allocated host memory. */
861  mem_alloc_result = CUDA_SUCCESS;
862  shared_pointer = mem.shared_pointer;
863  }
864  else if (map_host_used + size < map_host_limit) {
865  /* Allocate host memory ourselves. */
866  mem_alloc_result = cuMemHostAlloc(
867  &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
868 
869  assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
870  (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
871  }
872 
873  if (mem_alloc_result == CUDA_SUCCESS) {
874  cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
875  map_host_used += size;
876  status = " in host memory";
877  }
878  }
879 
880  if (mem_alloc_result != CUDA_SUCCESS) {
881  if (mem.type == MEM_DEVICE_ONLY) {
882  status = " failed, out of device memory";
883  set_error("System is out of GPU memory");
884  }
885  else {
886  status = " failed, out of device and host memory";
887  set_error("System is out of GPU and shared host memory");
888  }
889  }
890 
891  if (mem.name) {
892  VLOG(1) << "Buffer allocate: " << mem.name << ", "
893  << string_human_readable_number(mem.memory_size()) << " bytes. ("
894  << string_human_readable_size(mem.memory_size()) << ")" << status;
895  }
896 
897  mem.device_pointer = (device_ptr)device_pointer;
898  mem.device_size = size;
899  stats.mem_alloc(size);
900 
901  if (!mem.device_pointer) {
902  return NULL;
903  }
904 
905  /* Insert into map of allocations. */
906  thread_scoped_lock lock(cuda_mem_map_mutex);
907  CUDAMem *cmem = &cuda_mem_map[&mem];
908  if (shared_pointer != 0) {
909  /* Replace host pointer with our host allocation. Only works if
910  * CUDA memory layout is the same and has no pitch padding. Also
911  * does not work if we move textures to host during a render,
912  * since other devices might be using the memory. */
913 
914  if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
915  mem.host_pointer != shared_pointer) {
916  memcpy(shared_pointer, mem.host_pointer, size);
917 
918  /* A call to device_memory::host_free() should be preceded by
919  * a call to device_memory::device_free() for host memory
920  * allocated by a device to be handled properly. Two exceptions
921  * are here and a call in OptiXDevice::generic_alloc(), where
922  * the current host memory can be assumed to be allocated by
923  * device_memory::host_alloc(), not by a device. */
924 
925  mem.host_free();
926  mem.host_pointer = shared_pointer;
927  }
928  mem.shared_pointer = shared_pointer;
929  mem.shared_counter++;
930  cmem->use_mapped_host = true;
931  }
932  else {
933  cmem->use_mapped_host = false;
934  }
935 
936  return cmem;
937 }
938 
939 void CUDADevice::generic_copy_to(device_memory &mem)
940 {
941  if (!mem.host_pointer || !mem.device_pointer) {
942  return;
943  }
944 
945  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
946  * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
947  * mem.host_pointer. */
948  thread_scoped_lock lock(cuda_mem_map_mutex);
949  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
950  const CUDAContextScope scope(this);
951  cuda_assert(
952  cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
953  }
954 }
955 
956 void CUDADevice::generic_free(device_memory &mem)
957 {
958  if (mem.device_pointer) {
959  CUDAContextScope scope(this);
960  thread_scoped_lock lock(cuda_mem_map_mutex);
961  const CUDAMem &cmem = cuda_mem_map[&mem];
962 
963  /* If cmem.use_mapped_host is true, reference counting is used
964  * to safely free mapped host memory. */
965 
966  if (cmem.use_mapped_host) {
967  assert(mem.shared_pointer);
968  if (mem.shared_pointer) {
969  assert(mem.shared_counter > 0);
970  if (--mem.shared_counter == 0) {
971  if (mem.host_pointer == mem.shared_pointer) {
972  mem.host_pointer = 0;
973  }
974  cuMemFreeHost(mem.shared_pointer);
975  mem.shared_pointer = 0;
976  }
977  }
978  map_host_used -= mem.device_size;
979  }
980  else {
981  /* Free device memory. */
982  cuda_assert(cuMemFree(mem.device_pointer));
983  }
984 
985  stats.mem_free(mem.device_size);
986  mem.device_pointer = 0;
987  mem.device_size = 0;
988 
989  cuda_mem_map.erase(cuda_mem_map.find(&mem));
990  }
991 }
992 
993 void CUDADevice::mem_alloc(device_memory &mem)
994 {
995  if (mem.type == MEM_PIXELS && !background) {
996  pixels_alloc(mem);
997  }
998  else if (mem.type == MEM_TEXTURE) {
999  assert(!"mem_alloc not supported for textures.");
1000  }
1001  else if (mem.type == MEM_GLOBAL) {
1002  assert(!"mem_alloc not supported for global memory.");
1003  }
1004  else {
1005  generic_alloc(mem);
1006  }
1007 }
1008 
1009 void CUDADevice::mem_copy_to(device_memory &mem)
1010 {
1011  if (mem.type == MEM_PIXELS) {
1012  assert(!"mem_copy_to not supported for pixels.");
1013  }
1014  else if (mem.type == MEM_GLOBAL) {
1015  global_free(mem);
1016  global_alloc(mem);
1017  }
1018  else if (mem.type == MEM_TEXTURE) {
1019  tex_free((device_texture &)mem);
1020  tex_alloc((device_texture &)mem);
1021  }
1022  else {
1023  if (!mem.device_pointer) {
1024  generic_alloc(mem);
1025  }
1026  generic_copy_to(mem);
1027  }
1028 }
1029 
1030 void CUDADevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
1031 {
1032  if (mem.type == MEM_PIXELS && !background) {
1033  pixels_copy_from(mem, y, w, h);
1034  }
1035  else if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
1036  assert(!"mem_copy_from not supported for textures.");
1037  }
1038  else if (mem.host_pointer) {
1039  const size_t size = elem * w * h;
1040  const size_t offset = elem * y * w;
1041 
1042  if (mem.device_pointer) {
1043  const CUDAContextScope scope(this);
1044  cuda_assert(cuMemcpyDtoH(
1045  (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
1046  }
1047  else {
1048  memset((char *)mem.host_pointer + offset, 0, size);
1049  }
1050  }
1051 }
1052 
1053 void CUDADevice::mem_zero(device_memory &mem)
1054 {
1055  if (!mem.device_pointer) {
1056  mem_alloc(mem);
1057  }
1058  if (!mem.device_pointer) {
1059  return;
1060  }
1061 
1062  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
1063  * regardless of mem.host_pointer and mem.shared_pointer. */
1064  thread_scoped_lock lock(cuda_mem_map_mutex);
1065  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
1066  const CUDAContextScope scope(this);
1067  cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
1068  }
1069  else if (mem.host_pointer) {
1070  memset(mem.host_pointer, 0, mem.memory_size());
1071  }
1072 }
1073 
1074 void CUDADevice::mem_free(device_memory &mem)
1075 {
1076  if (mem.type == MEM_PIXELS && !background) {
1077  pixels_free(mem);
1078  }
1079  else if (mem.type == MEM_GLOBAL) {
1080  global_free(mem);
1081  }
1082  else if (mem.type == MEM_TEXTURE) {
1083  tex_free((device_texture &)mem);
1084  }
1085  else {
1086  generic_free(mem);
1087  }
1088 }
1089 
1090 device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
1091 {
1092  return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
1093 }
1094 
1095 void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
1096 {
1097  CUDAContextScope scope(this);
1098  CUdeviceptr mem;
1099  size_t bytes;
1100 
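  /* Looks up a module-level global by name and copies host data into it;
   * e.g. the scene's KernelData is uploaded this way (assuming, as elsewhere
   * in Cycles, a constant named "__data"). */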
1101  cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
1102  // assert(bytes == size);
1103  cuda_assert(cuMemcpyHtoD(mem, host, size));
1104 }
1105 
1106 void CUDADevice::global_alloc(device_memory &mem)
1107 {
1108  if (mem.is_resident(this)) {
1109  generic_alloc(mem);
1110  generic_copy_to(mem);
1111  }
1112 
1113  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
1114 }
1115 
1116 void CUDADevice::global_free(device_memory &mem)
1117 {
1118  if (mem.is_resident(this) && mem.device_pointer) {
1119  generic_free(mem);
1120  }
1121 }
1122 
1123 void CUDADevice::tex_alloc(device_texture &mem)
1124 {
1125  CUDAContextScope scope(this);
1126 
1127  /* General variables for both architectures */
1128  string bind_name = mem.name;
1129  size_t dsize = datatype_size(mem.data_type);
1130  size_t size = mem.memory_size();
1131 
1132  CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
1133  switch (mem.info.extension) {
1134  case EXTENSION_REPEAT:
1135  address_mode = CU_TR_ADDRESS_MODE_WRAP;
1136  break;
1137  case EXTENSION_EXTEND:
1138  address_mode = CU_TR_ADDRESS_MODE_CLAMP;
1139  break;
1140  case EXTENSION_CLIP:
1141  address_mode = CU_TR_ADDRESS_MODE_BORDER;
1142  break;
1143  default:
1144  assert(0);
1145  break;
1146  }
1147 
1148  CUfilter_mode filter_mode;
1149  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
1150  filter_mode = CU_TR_FILTER_MODE_POINT;
1151  }
1152  else {
1153  filter_mode = CU_TR_FILTER_MODE_LINEAR;
1154  }
1155 
1156  /* Image Texture Storage */
1157  CUarray_format_enum format;
1158  switch (mem.data_type) {
1159  case TYPE_UCHAR:
1160  format = CU_AD_FORMAT_UNSIGNED_INT8;
1161  break;
1162  case TYPE_UINT16:
1163  format = CU_AD_FORMAT_UNSIGNED_INT16;
1164  break;
1165  case TYPE_UINT:
1166  format = CU_AD_FORMAT_UNSIGNED_INT32;
1167  break;
1168  case TYPE_INT:
1169  format = CU_AD_FORMAT_SIGNED_INT32;
1170  break;
1171  case TYPE_FLOAT:
1172  format = CU_AD_FORMAT_FLOAT;
1173  break;
1174  case TYPE_HALF:
1175  format = CU_AD_FORMAT_HALF;
1176  break;
1177  default:
1178  assert(0);
1179  return;
1180  }
1181 
1182  CUDAMem *cmem = NULL;
1183  CUarray array_3d = NULL;
1184  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
1185  size_t dst_pitch = src_pitch;
1186 
1187  if (!mem.is_resident(this)) {
1188  thread_scoped_lock lock(cuda_mem_map_mutex);
1189  cmem = &cuda_mem_map[&mem];
1190  cmem->texobject = 0;
1191 
1192  if (mem.data_depth > 1) {
1193  array_3d = (CUarray)mem.device_pointer;
1194  cmem->array = array_3d;
1195  }
1196  else if (mem.data_height > 0) {
1197  dst_pitch = align_up(src_pitch, pitch_alignment);
1198  }
1199  }
1200  else if (mem.data_depth > 1) {
1201  /* 3D texture using array, there is no API for linear memory. */
1202  CUDA_ARRAY3D_DESCRIPTOR desc;
1203 
1204  desc.Width = mem.data_width;
1205  desc.Height = mem.data_height;
1206  desc.Depth = mem.data_depth;
1207  desc.Format = format;
1208  desc.NumChannels = mem.data_elements;
1209  desc.Flags = 0;
1210 
1211  VLOG(1) << "Array 3D allocate: " << mem.name << ", "
1212  << string_human_readable_number(mem.memory_size()) << " bytes. ("
1213  << string_human_readable_size(mem.memory_size()) << ")";
1214 
1215  cuda_assert(cuArray3DCreate(&array_3d, &desc));
1216 
1217  if (!array_3d) {
1218  return;
1219  }
1220 
1221  CUDA_MEMCPY3D param;
1222  memset(&param, 0, sizeof(param));
1223  param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
1224  param.dstArray = array_3d;
1225  param.srcMemoryType = CU_MEMORYTYPE_HOST;
1226  param.srcHost = mem.host_pointer;
1227  param.srcPitch = src_pitch;
1228  param.WidthInBytes = param.srcPitch;
1229  param.Height = mem.data_height;
1230  param.Depth = mem.data_depth;
1231 
1232  cuda_assert(cuMemcpy3D(&param));
1233 
1234  mem.device_pointer = (device_ptr)array_3d;
1235  mem.device_size = size;
1236  stats.mem_alloc(size);
1237 
1238  thread_scoped_lock lock(cuda_mem_map_mutex);
1239  cmem = &cuda_mem_map[&mem];
1240  cmem->texobject = 0;
1241  cmem->array = array_3d;
1242  }
1243  else if (mem.data_height > 0) {
1244  /* 2D texture, using pitch aligned linear memory. */
1245  dst_pitch = align_up(src_pitch, pitch_alignment);
1246  size_t dst_size = dst_pitch * mem.data_height;
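  /* Example: a 1001-pixel-wide float4 texture has src_pitch = 1001 * 4 * 4 =
   * 16016 bytes; assuming a pitch_alignment of 32, dst_pitch rounds up to
   * 16032 and dst_size covers data_height rows of the padded pitch. */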
1247 
1248  cmem = generic_alloc(mem, dst_size - mem.memory_size());
1249  if (!cmem) {
1250  return;
1251  }
1252 
1253  CUDA_MEMCPY2D param;
1254  memset(&param, 0, sizeof(param));
1255  param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1256  param.dstDevice = mem.device_pointer;
1257  param.dstPitch = dst_pitch;
1258  param.srcMemoryType = CU_MEMORYTYPE_HOST;
1259  param.srcHost = mem.host_pointer;
1260  param.srcPitch = src_pitch;
1261  param.WidthInBytes = param.srcPitch;
1262  param.Height = mem.data_height;
1263 
1264  cuda_assert(cuMemcpy2DUnaligned(&param));
1265  }
1266  else {
1267  /* 1D texture, using linear memory. */
1268  cmem = generic_alloc(mem);
1269  if (!cmem) {
1270  return;
1271  }
1272 
1273  cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
1274  }
1275 
1276  /* Resize once */
1277  const uint slot = mem.slot;
1278  if (slot >= texture_info.size()) {
1279  /* Allocate some slots in advance, to reduce the number
1280  * of re-allocations. */
1281  texture_info.resize(slot + 128);
1282  }
1283 
1284  /* Set Mapping and tag that we need to (re-)upload to device */
1285  texture_info[slot] = mem.info;
1286  need_texture_info = true;
1287 
1288  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
1289  mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3) {
1290  /* Kepler+, bindless textures. */
1291  CUDA_RESOURCE_DESC resDesc;
1292  memset(&resDesc, 0, sizeof(resDesc));
1293 
1294  if (array_3d) {
1295  resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
1296  resDesc.res.array.hArray = array_3d;
1297  resDesc.flags = 0;
1298  }
1299  else if (mem.data_height > 0) {
1300  resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
1301  resDesc.res.pitch2D.devPtr = mem.device_pointer;
1302  resDesc.res.pitch2D.format = format;
1303  resDesc.res.pitch2D.numChannels = mem.data_elements;
1304  resDesc.res.pitch2D.height = mem.data_height;
1305  resDesc.res.pitch2D.width = mem.data_width;
1306  resDesc.res.pitch2D.pitchInBytes = dst_pitch;
1307  }
1308  else {
1309  resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
1310  resDesc.res.linear.devPtr = mem.device_pointer;
1311  resDesc.res.linear.format = format;
1312  resDesc.res.linear.numChannels = mem.data_elements;
1313  resDesc.res.linear.sizeInBytes = mem.device_size;
1314  }
1315 
1316  CUDA_TEXTURE_DESC texDesc;
1317  memset(&texDesc, 0, sizeof(texDesc));
1318  texDesc.addressMode[0] = address_mode;
1319  texDesc.addressMode[1] = address_mode;
1320  texDesc.addressMode[2] = address_mode;
1321  texDesc.filterMode = filter_mode;
1322  texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
1323 
1324  thread_scoped_lock lock(cuda_mem_map_mutex);
1325  cmem = &cuda_mem_map[&mem];
1326 
1327  cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
1328 
1329  texture_info[slot].data = (uint64_t)cmem->texobject;
1330  }
1331  else {
1332  texture_info[slot].data = (uint64_t)mem.device_pointer;
1333  }
1334 }
1335 
1336 void CUDADevice::tex_free(device_texture &mem)
1337 {
1338  if (mem.device_pointer) {
1339  CUDAContextScope scope(this);
1340  thread_scoped_lock lock(cuda_mem_map_mutex);
1341  const CUDAMem &cmem = cuda_mem_map[&mem];
1342 
1343  if (cmem.texobject) {
1344  /* Free bindless texture. */
1345  cuTexObjectDestroy(cmem.texobject);
1346  }
1347 
1348  if (!mem.is_resident(this)) {
1349  /* Do not free memory here, since it was allocated on a different device. */
1350  cuda_mem_map.erase(cuda_mem_map.find(&mem));
1351  }
1352  else if (cmem.array) {
1353  /* Free array. */
1354  cuArrayDestroy(cmem.array);
1355  stats.mem_free(mem.device_size);
1356  mem.device_pointer = 0;
1357  mem.device_size = 0;
1358 
1359  cuda_mem_map.erase(cuda_mem_map.find(&mem));
1360  }
1361  else {
1362  lock.unlock();
1363  generic_free(mem);
1364  }
1365  }
1366 }
1367 
1368 # define CUDA_GET_BLOCKSIZE(func, w, h) \
1369  int threads_per_block; \
1370  cuda_assert( \
1371  cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
1372  int threads = (int)sqrt((float)threads_per_block); \
1373  int xblocks = ((w) + threads - 1) / threads; \
1374  int yblocks = ((h) + threads - 1) / threads;
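/* Example: with CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1024 this yields
 * 32x32 thread blocks, so a 1920x1080 buffer launches 60x34 blocks. */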
1375 
1376 # define CUDA_LAUNCH_KERNEL(func, args) \
1377  cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
1378 
1379 /* Similar to the above, but for 1-dimensional blocks. */
1380 # define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
1381  int threads_per_block; \
1382  cuda_assert( \
1383  cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
1384  int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
1385  int yblocks = h;
1386 
1387 # define CUDA_LAUNCH_KERNEL_1D(func, args) \
1388  cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
1389 
1390 bool CUDADevice::denoising_non_local_means(device_ptr image_ptr,
1391  device_ptr guide_ptr,
1392  device_ptr variance_ptr,
1393  device_ptr out_ptr,
1394  DenoisingTask *task)
1395 {
1396  if (have_error())
1397  return false;
1398 
1399  CUDAContextScope scope(this);
1400 
1401  int stride = task->buffer.stride;
1402  int w = task->buffer.width;
1403  int h = task->buffer.h;
1404  int r = task->nlm_state.r;
1405  int f = task->nlm_state.f;
1406  float a = task->nlm_state.a;
1407  float k_2 = task->nlm_state.k_2;
1408 
1409  int pass_stride = task->buffer.pass_stride;
1410  int num_shifts = (2 * r + 1) * (2 * r + 1);
1411  int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
1412  int frame_offset = 0;
1413 
1414  if (have_error())
1415  return false;
1416 
1417  CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
1418  CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
1419  CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
1420  CUdeviceptr scale_ptr = 0;
1421 
1422  cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
1423  cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
1424 
1425  {
1426  CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
1427  cuda_assert(cuModuleGetFunction(
1428  &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
1429  cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
1430  cuda_assert(cuModuleGetFunction(
1431  &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
1432  cuda_assert(cuModuleGetFunction(
1433  &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
1434 
1435  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
1436  cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
1437  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
1438  cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
1439 
1440  CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
1441 
1442  void *calc_difference_args[] = {&guide_ptr,
1443  &variance_ptr,
1444  &scale_ptr,
1445  &difference,
1446  &w,
1447  &h,
1448  &stride,
1449  &pass_stride,
1450  &r,
1451  &channel_offset,
1452  &frame_offset,
1453  &a,
1454  &k_2};
1455  void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
1456  void *calc_weight_args[] = {
1457  &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
1458  void *update_output_args[] = {&blurDifference,
1459  &image_ptr,
1460  &out_ptr,
1461  &weightAccum,
1462  &w,
1463  &h,
1464  &stride,
1465  &pass_stride,
1466  &channel_offset,
1467  &r,
1468  &f};
1469 
1470  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
1471  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1472  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
1473  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1474  CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
1475  }
1476 
1477  {
1478  CUfunction cuNLMNormalize;
1479  cuda_assert(
1480  cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
1481  cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
1482  void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
1483  CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
1484  CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
1485  cuda_assert(cuCtxSynchronize());
1486  }
1487 
1488  return !have_error();
1489 }
1490 
1491 bool CUDADevice::denoising_construct_transform(DenoisingTask *task)
1492 {
1493  if (have_error())
1494  return false;
1495 
1496  CUDAContextScope scope(this);
1497 
1498  CUfunction cuFilterConstructTransform;
1499  cuda_assert(cuModuleGetFunction(
1500  &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
1501  cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
1502  CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
1503 
1504  void *args[] = {&task->buffer.mem.device_pointer,
1505  &task->tile_info_mem.device_pointer,
1506  &task->storage.transform.device_pointer,
1507  &task->storage.rank.device_pointer,
1508  &task->filter_area,
1509  &task->rect,
1510  &task->radius,
1511  &task->pca_threshold,
1512  &task->buffer.pass_stride,
1513  &task->buffer.frame_stride,
1514  &task->buffer.use_time};
1515  CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
1516  cuda_assert(cuCtxSynchronize());
1517 
1518  return !have_error();
1519 }
1520 
1521 bool CUDADevice::denoising_accumulate(device_ptr color_ptr,
1522  device_ptr color_variance_ptr,
1523  device_ptr scale_ptr,
1524  int frame,
1525  DenoisingTask *task)
1526 {
1527  if (have_error())
1528  return false;
1529 
1530  CUDAContextScope scope(this);
1531 
1532  int r = task->radius;
1533  int f = 4;
1534  float a = 1.0f;
1535  float k_2 = task->nlm_k_2;
1536 
1537  int w = task->reconstruction_state.source_w;
1538  int h = task->reconstruction_state.source_h;
1539  int stride = task->buffer.stride;
1540  int frame_offset = frame * task->buffer.frame_stride;
1541  int t = task->tile_info->frames[frame];
1542 
1543  int pass_stride = task->buffer.pass_stride;
1544  int num_shifts = (2 * r + 1) * (2 * r + 1);
1545 
1546  if (have_error())
1547  return false;
1548 
1549  CUdeviceptr difference = (CUdeviceptr)task->buffer.temporary_mem.device_pointer;
1550  CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
1551 
1552  CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
1553  cuda_assert(cuModuleGetFunction(
1554  &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
1555  cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
1556  cuda_assert(
1557  cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
1558  cuda_assert(cuModuleGetFunction(
1559  &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
1560 
1561  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
1562  cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
1563  cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
1564  cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
1565 
1566  CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
1567  task->reconstruction_state.source_w * task->reconstruction_state.source_h,
1568  num_shifts);
1569 
1570  void *calc_difference_args[] = {&color_ptr,
1571  &color_variance_ptr,
1572  &scale_ptr,
1573  &difference,
1574  &w,
1575  &h,
1576  &stride,
1577  &pass_stride,
1578  &r,
1579  &pass_stride,
1580  &frame_offset,
1581  &a,
1582  &k_2};
1583  void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
1584  void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
1585  void *construct_gramian_args[] = {&t,
1586  &blurDifference,
1587  &task->buffer.mem.device_pointer,
1588  &task->storage.transform.device_pointer,
1589  &task->storage.rank.device_pointer,
1590  &task->storage.XtWX.device_pointer,
1591  &task->storage.XtWY.device_pointer,
1592  &task->reconstruction_state.filter_window,
1593  &w,
1594  &h,
1595  &stride,
1596  &pass_stride,
1597  &r,
1598  &f,
1599  &frame_offset,
1600  &task->buffer.use_time};
1601 
1602  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
1603  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1604  CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
1605  CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
1606  CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
1607  cuda_assert(cuCtxSynchronize());
1608 
1609  return !have_error();
1610 }
1611 
1612 bool CUDADevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
1613 {
1614  CUfunction cuFinalize;
1615  cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
1616  cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
1617  void *finalize_args[] = {&output_ptr,
1618  &task->storage.rank.device_pointer,
1619  &task->storage.XtWX.device_pointer,
1620  &task->storage.XtWY.device_pointer,
1621  &task->filter_area,
1622  &task->reconstruction_state.buffer_params.x,
1623  &task->render_buffer.samples};
1624  CUDA_GET_BLOCKSIZE(
1625  cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
1626  CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
1627  cuda_assert(cuCtxSynchronize());
1628 
1629  return !have_error();
1630 }
1631 
1632 bool CUDADevice::denoising_combine_halves(device_ptr a_ptr,
1633  device_ptr b_ptr,
1634  device_ptr mean_ptr,
1635  device_ptr variance_ptr,
1636  int r,
1637  int4 rect,
1638  DenoisingTask *task)
1639 {
1640  if (have_error())
1641  return false;
1642 
1643  CUDAContextScope scope(this);
1644 
1645  CUfunction cuFilterCombineHalves;
1646  cuda_assert(cuModuleGetFunction(
1647  &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
1648  cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
1649  CUDA_GET_BLOCKSIZE(
1650  cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1651 
1652  void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
1653  CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
1654  cuda_assert(cuCtxSynchronize());
1655 
1656  return !have_error();
1657 }
1658 
1659 bool CUDADevice::denoising_divide_shadow(device_ptr a_ptr,
1660  device_ptr b_ptr,
1661  device_ptr sample_variance_ptr,
1662  device_ptr sv_variance_ptr,
1663  device_ptr buffer_variance_ptr,
1664  DenoisingTask *task)
1665 {
1666  if (have_error())
1667  return false;
1668 
1669  CUDAContextScope scope(this);
1670 
1671  CUfunction cuFilterDivideShadow;
1672  cuda_assert(cuModuleGetFunction(
1673  &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
1674  cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
1675  CUDA_GET_BLOCKSIZE(
1676  cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1677 
1678  void *args[] = {&task->render_buffer.samples,
1679  &task->tile_info_mem.device_pointer,
1680  &a_ptr,
1681  &b_ptr,
1682  &sample_variance_ptr,
1683  &sv_variance_ptr,
1684  &buffer_variance_ptr,
1685  &task->rect,
1686  &task->render_buffer.pass_stride,
1687  &task->render_buffer.offset};
1688  CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
1689  cuda_assert(cuCtxSynchronize());
1690 
1691  return !have_error();
1692 }
1693 
1694 bool CUDADevice::denoising_get_feature(int mean_offset,
1695  int variance_offset,
1696  device_ptr mean_ptr,
1697  device_ptr variance_ptr,
1698  float scale,
1699  DenoisingTask *task)
1700 {
1701  if (have_error())
1702  return false;
1703 
1704  CUDAContextScope scope(this);
1705 
1706  CUfunction cuFilterGetFeature;
1707  cuda_assert(
1708  cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
1709  cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
1710  CUDA_GET_BLOCKSIZE(cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1711 
1712  void *args[] = {&task->render_buffer.samples,
1713  &task->tile_info_mem.device_pointer,
1714  &mean_offset,
1715  &variance_offset,
1716  &mean_ptr,
1717  &variance_ptr,
1718  &scale,
1719  &task->rect,
1720  &task->render_buffer.pass_stride,
1721  &task->render_buffer.offset};
1722  CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
1723  cuda_assert(cuCtxSynchronize());
1724 
1725  return !have_error();
1726 }
1727 
1728 bool CUDADevice::denoising_write_feature(int out_offset,
1729  device_ptr from_ptr,
1730  device_ptr buffer_ptr,
1731  DenoisingTask *task)
1732 {
1733  if (have_error())
1734  return false;
1735 
1736  CUDAContextScope scope(this);
1737 
1738  CUfunction cuFilterWriteFeature;
1739  cuda_assert(cuModuleGetFunction(
1740  &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
1741  cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
1742  CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
1743 
1744  void *args[] = {&task->render_buffer.samples,
1745  &task->reconstruction_state.buffer_params,
1746  &task->filter_area,
1747  &from_ptr,
1748  &buffer_ptr,
1749  &out_offset,
1750  &task->rect};
1751  CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
1752  cuda_assert(cuCtxSynchronize());
1753 
1754  return !have_error();
1755 }
1756 
1757 bool CUDADevice::denoising_detect_outliers(device_ptr image_ptr,
1758  device_ptr variance_ptr,
1759  device_ptr depth_ptr,
1760  device_ptr output_ptr,
1761  DenoisingTask *task)
1762 {
1763  if (have_error())
1764  return false;
1765 
1766  CUDAContextScope scope(this);
1767 
1768  CUfunction cuFilterDetectOutliers;
1769  cuda_assert(cuModuleGetFunction(
1770  &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
1771  cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
1772  CUDA_GET_BLOCKSIZE(
1773  cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1774 
1775  void *args[] = {
1776  &image_ptr, &variance_ptr, &depth_ptr, &output_ptr, &task->rect, &task->buffer.pass_stride};
1777 
1778  CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
1779  cuda_assert(cuCtxSynchronize());
1780 
1781  return !have_error();
1782 }
1783 
1784 void CUDADevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
1785 {
1786  denoising.functions.construct_transform = function_bind(
1787  &CUDADevice::denoising_construct_transform, this, &denoising);
1788  denoising.functions.accumulate = function_bind(
1789  &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
1790  denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
1791  denoising.functions.divide_shadow = function_bind(
1792  &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1793  denoising.functions.non_local_means = function_bind(
1794  &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1795  denoising.functions.combine_halves = function_bind(
1796  &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1797  denoising.functions.get_feature = function_bind(
1798  &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
1799  denoising.functions.write_feature = function_bind(
1800  &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
1801  denoising.functions.detect_outliers = function_bind(
1802  &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1803 
1804  denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1805  denoising.render_buffer.samples = rtile.sample;
1806  denoising.buffer.gpu_temporary_mem = true;
1807 
1808  denoising.run_denoising(rtile);
1809 }
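
/* Editorial sketch: function_bind is Cycles' alias for std::bind (see
 * util_function.h), so the wiring above pre-binds `this` and &denoising while
 * the _1.._N placeholders forward the DeviceFunctions callback arguments. A
 * minimal standalone analogue of that adaptor pattern, using hypothetical
 * names: */
#if 0 /* illustration only */
#  include <functional>
struct ExampleDevice {
  bool divide_shadow(int a, int b, void *task);
};
static std::function<bool(int, int)> bind_example(ExampleDevice *device, void *task)
{
  using namespace std::placeholders;
  /* `task` is bound up front, exactly like &denoising above; callers only
   * supply the two leading arguments. */
  return std::bind(&ExampleDevice::divide_shadow, device, _1, _2, task);
}
#endif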
1810 
1811 void CUDADevice::adaptive_sampling_filter(uint filter_sample,
1812  WorkTile *wtile,
1813  CUdeviceptr d_wtile,
1814  CUstream stream)
1815 {
1816  const int num_threads_per_block = functions.adaptive_num_threads_per_block;
1817 
1818  /* These are a series of tiny kernels because there is no grid synchronization
1819  * from within a kernel, so multiple kernel launches it is. */
1820  uint total_work_size = wtile->h * wtile->w;
1821  void *args2[] = {&d_wtile, &filter_sample, &total_work_size};
1822  uint num_blocks = divide_up(total_work_size, num_threads_per_block);
1823  cuda_assert(cuLaunchKernel(functions.adaptive_stopping,
1824  num_blocks,
1825  1,
1826  1,
1827  num_threads_per_block,
1828  1,
1829  1,
1830  0,
1831  stream,
1832  args2,
1833  0));
1834  total_work_size = wtile->h;
1835  num_blocks = divide_up(total_work_size, num_threads_per_block);
1836  cuda_assert(cuLaunchKernel(functions.adaptive_filter_x,
1837  num_blocks,
1838  1,
1839  1,
1840  num_threads_per_block,
1841  1,
1842  1,
1843  0,
1844  stream,
1845  args2,
1846  0));
1847  total_work_size = wtile->w;
1848  num_blocks = divide_up(total_work_size, num_threads_per_block);
1849  cuda_assert(cuLaunchKernel(functions.adaptive_filter_y,
1850  num_blocks,
1851  1,
1852  1,
1853  num_threads_per_block,
1854  1,
1855  1,
1856  0,
1857  stream,
1858  args2,
1859  0));
1860 }
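
/* Editorial note: args2 above stores the *address* of total_work_size, so
 * reassigning the variable between launches feeds each kernel its new work
 * size without rebuilding the argument array; cuLaunchKernel reads the
 * pointed-to values when the launch is enqueued. divide_up() (util_types.h)
 * is plain ceiling division: */
#if 0 /* illustration only */
static unsigned int divide_up_example(unsigned int x, unsigned int y)
{
  /* e.g. divide_up(1000, 128) == 8 blocks to cover 1000 work items. */
  return (x + y - 1) / y;
}
#endif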
1861 
1862 void CUDADevice::adaptive_sampling_post(RenderTile &rtile,
1863  WorkTile *wtile,
1864  CUdeviceptr d_wtile,
1865  CUstream stream)
1866 {
1867  const int num_threads_per_block = functions.adaptive_num_threads_per_block;
1868  uint total_work_size = wtile->h * wtile->w;
1869 
1870  void *args[] = {&d_wtile, &rtile.start_sample, &rtile.sample, &total_work_size};
1871  uint num_blocks = divide_up(total_work_size, num_threads_per_block);
1872  cuda_assert(cuLaunchKernel(functions.adaptive_scale_samples,
1873  num_blocks,
1874  1,
1875  1,
1876  num_threads_per_block,
1877  1,
1878  1,
1879  0,
1880  stream,
1881  args,
1882  0));
1883 }
1884 
1885 void CUDADevice::render(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
1886 {
1887  scoped_timer timer(&rtile.buffers->render_time);
1888 
1889  if (have_error())
1890  return;
1891 
1892  CUDAContextScope scope(this);
1893  CUfunction cuRender;
1894 
1895  /* Get kernel function. */
1896  if (rtile.task == RenderTile::BAKE) {
1897  cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_bake"));
1898  }
1899  else if (task.integrator_branched) {
1900  cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_branched_path_trace"));
1901  }
1902  else {
1903  cuda_assert(cuModuleGetFunction(&cuRender, cuModule, "kernel_cuda_path_trace"));
1904  }
1905 
1906  if (have_error()) {
1907  return;
1908  }
1909 
1910  cuda_assert(cuFuncSetCacheConfig(cuRender, CU_FUNC_CACHE_PREFER_L1));
1911 
1912  /* Allocate work tile. */
1913  work_tiles.alloc(1);
1914 
1915  WorkTile *wtile = work_tiles.data();
1916  wtile->x = rtile.x;
1917  wtile->y = rtile.y;
1918  wtile->w = rtile.w;
1919  wtile->h = rtile.h;
1920  wtile->offset = rtile.offset;
1921  wtile->stride = rtile.stride;
1922  wtile->buffer = (float *)(CUdeviceptr)rtile.buffer;
1923 
1924  /* Prepare work size. More step samples render faster, but for now we
1925  * remain conservative for GPUs connected to a display to avoid driver
1926  * timeouts and display freezing. */
1927  int min_blocks, num_threads_per_block;
1928  cuda_assert(
1929  cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuRender, NULL, 0, 0));
1930  if (!info.display_device) {
1931  min_blocks *= 8;
1932  }
1933 
1934  uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
1935 
1936  /* Render all samples. */
1937  int start_sample = rtile.start_sample;
1938  int end_sample = rtile.start_sample + rtile.num_samples;
1939 
1940  for (int sample = start_sample; sample < end_sample;) {
1941  /* Setup and copy work tile to device. */
1942  wtile->start_sample = sample;
1943  wtile->num_samples = step_samples;
1944  if (task.adaptive_sampling.use) {
1945  wtile->num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
1946  }
1947  wtile->num_samples = min(wtile->num_samples, end_sample - sample);
1948  work_tiles.copy_to_device();
1949 
1950  CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
1951  uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
1952  uint num_blocks = divide_up(total_work_size, num_threads_per_block);
1953 
1954  /* Launch kernel. */
1955  void *args[] = {&d_work_tiles, &total_work_size};
1956 
1957  cuda_assert(
1958  cuLaunchKernel(cuRender, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
1959 
1960  /* Run the adaptive sampling kernels at selected samples aligned to step samples. */
1961  uint filter_sample = sample + wtile->num_samples - 1;
1962  if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
1963  adaptive_sampling_filter(filter_sample, wtile, d_work_tiles);
1964  }
1965 
1966  cuda_assert(cuCtxSynchronize());
1967 
1968  /* Update progress. */
1969  sample += wtile->num_samples;
1970  rtile.sample = sample;
1971  task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
1972 
1973  if (task.get_cancel()) {
1974  if (task.need_finish_queue == false)
1975  break;
1976  }
1977  }
1978 
1979  /* Finalize adaptive sampling. */
1980  if (task.adaptive_sampling.use) {
1981  CUdeviceptr d_work_tiles = (CUdeviceptr)work_tiles.device_pointer;
1982  adaptive_sampling_post(rtile, wtile, d_work_tiles);
1983  cuda_assert(cuCtxSynchronize());
1984  task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
1985  }
1986 }
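
/* Editorial note (hypothetical numbers): the step-sample sizing above keeps
 * the GPU saturated for several blocks per launch. If
 * cuOccupancyMaxPotentialBlockSize reported min_blocks = 40 and
 * num_threads_per_block = 256, an off-display device scales that to 320
 * blocks, i.e. 81920 threads; for a 128x128 tile (16384 pixels) this gives
 * step_samples = divide_up(320 * 256, 128 * 128) = 5 samples per launch. */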
1987 
1988 void CUDADevice::film_convert(DeviceTask &task,
1989  device_ptr buffer,
1990  device_ptr rgba_byte,
1991  device_ptr rgba_half)
1992 {
1993  if (have_error())
1994  return;
1995 
1996  CUDAContextScope scope(this);
1997 
1998  CUfunction cuFilmConvert;
1999  CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
2000  CUdeviceptr d_buffer = (CUdeviceptr)buffer;
2001 
2002  /* get kernel function */
2003  if (rgba_half) {
2004  cuda_assert(
2005  cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
2006  }
2007  else {
2008  cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
2009  }
2010 
2011  float sample_scale = 1.0f / (task.sample + 1);
2012 
2013  /* pass in parameters */
2014  void *args[] = {&d_rgba,
2015  &d_buffer,
2016  &sample_scale,
2017  &task.x,
2018  &task.y,
2019  &task.w,
2020  &task.h,
2021  &task.offset,
2022  &task.stride};
2023 
2024  /* launch kernel */
2025  int threads_per_block;
2026  cuda_assert(cuFuncGetAttribute(
2027  &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
2028 
2029  int xthreads = (int)sqrt(threads_per_block);
2030  int ythreads = (int)sqrt(threads_per_block);
2031  int xblocks = (task.w + xthreads - 1) / xthreads;
2032  int yblocks = (task.h + ythreads - 1) / ythreads;
2033 
2034  cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
2035 
2036  cuda_assert(cuLaunchKernel(cuFilmConvert,
2037  xblocks,
2038  yblocks,
2039  1, /* blocks */
2040  xthreads,
2041  ythreads,
2042  1, /* threads */
2043  0,
2044  0,
2045  args,
2046  0));
2047 
2048  unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
2049 
2050  cuda_assert(cuCtxSynchronize());
2051 }
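
/* Editorial note (hypothetical numbers): the film-convert launch above
 * factors the kernel's thread limit into a square block; with
 * threads_per_block = 1024 both xthreads and ythreads become 32, so a
 * 1920x1080 frame is covered by 60 x 34 blocks (1080 / 32 = 33.75 rounds up
 * via the (h + ythreads - 1) / ythreads idiom). */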
2052 
2053 void CUDADevice::shader(DeviceTask &task)
2054 {
2055  if (have_error())
2056  return;
2057 
2058  CUDAContextScope scope(this);
2059 
2060  CUfunction cuShader;
2061  CUdeviceptr d_input = (CUdeviceptr)task.shader_input;
2062  CUdeviceptr d_output = (CUdeviceptr)task.shader_output;
2063 
2064  /* get kernel function */
2065  if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
2066  cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
2067  }
2068  else {
2069  cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
2070  }
2071 
2072  /* Do tasks in smaller chunks, so we can cancel them. */
2073  const int shader_chunk_size = 65536;
2074  const int start = task.shader_x;
2075  const int end = task.shader_x + task.shader_w;
2076  int offset = task.offset;
2077 
2078  bool canceled = false;
2079  for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
2080  for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
2081  int shader_w = min(shader_chunk_size, end - shader_x);
2082 
2083  /* pass in parameters */
2084  void *args[8];
2085  int arg = 0;
2086  args[arg++] = &d_input;
2087  args[arg++] = &d_output;
2088  args[arg++] = &task.shader_eval_type;
2089  if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
2090  args[arg++] = &task.shader_filter;
2091  }
2092  args[arg++] = &shader_x;
2093  args[arg++] = &shader_w;
2094  args[arg++] = &offset;
2095  args[arg++] = &sample;
2096 
2097  /* launch kernel */
2098  int threads_per_block;
2099  cuda_assert(cuFuncGetAttribute(
2100  &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
2101 
2102  int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
2103 
2104  cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
2105  cuda_assert(cuLaunchKernel(cuShader,
2106  xblocks,
2107  1,
2108  1, /* blocks */
2109  threads_per_block,
2110  1,
2111  1, /* threads */
2112  0,
2113  0,
2114  args,
2115  0));
2116 
2117  cuda_assert(cuCtxSynchronize());
2118 
2119  if (task.get_cancel()) {
2120  canceled = true;
2121  break;
2122  }
2123  }
2124 
2125  task.update_progress(NULL);
2126  }
2127 }
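
/* Editorial note: chunking shader evaluation into 64K-element launches above
 * trades a little launch overhead for responsiveness: the cuCtxSynchronize()
 * after every chunk bounds how long task.get_cancel() can go unchecked, so
 * long bakes and displacement evaluations stay cancelable. */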
2128 
2129 CUdeviceptr CUDADevice::map_pixels(device_ptr mem)
2130 {
2131  if (!background) {
2132  PixelMem pmem = pixel_mem_map[mem];
2133  CUdeviceptr buffer;
2134 
2135  size_t bytes;
2136  cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
2137  cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
2138 
2139  return buffer;
2140  }
2141 
2142  return (CUdeviceptr)mem;
2143 }
2144 
2145 void CUDADevice::unmap_pixels(device_ptr mem)
2146 {
2147  if (!background) {
2148  PixelMem pmem = pixel_mem_map[mem];
2149 
2150  cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
2151  }
2152 }
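
/* Editorial sketch: map_pixels()/unmap_pixels() above are the per-frame
 * middle of the standard CUDA/OpenGL interop lifecycle; registration happens
 * once in pixels_alloc() below and unregistration in pixels_free(). Assuming
 * an already-registered resource, one full cycle is: */
#if 0 /* illustration only */
static void interop_cycle_example(CUgraphicsResource resource)
{
  CUdeviceptr ptr = 0;
  size_t bytes = 0;
  cuGraphicsMapResources(1, &resource, 0);                    /* borrow from GL */
  cuGraphicsResourceGetMappedPointer(&ptr, &bytes, resource); /* device address */
  /* ... CUDA writes rendered pixels into ptr ... */
  cuGraphicsUnmapResources(1, &resource, 0);                  /* return to GL */
}
#endif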
2153 
2154 void CUDADevice::pixels_alloc(device_memory &mem)
2155 {
2156  PixelMem pmem;
2157 
2158  pmem.w = mem.data_width;
2159  pmem.h = mem.data_height;
2160 
2161  CUDAContextScope scope(this);
2162 
2163  glGenBuffers(1, &pmem.cuPBO);
2164  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
2165  if (mem.data_type == TYPE_HALF)
2166  glBufferData(
2167  GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
2168  else
2169  glBufferData(
2170  GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
2171 
2172  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
2173 
2174  glActiveTexture(GL_TEXTURE0);
2175  glGenTextures(1, &pmem.cuTexId);
2176  glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
2177  if (mem.data_type == TYPE_HALF)
2178  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
2179  else
2180  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
2181  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
2182  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
2183  glBindTexture(GL_TEXTURE_2D, 0);
2184 
2185  CUresult result = cuGraphicsGLRegisterBuffer(
2186  &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
2187 
2188  if (result == CUDA_SUCCESS) {
2189  mem.device_pointer = pmem.cuTexId;
2190  pixel_mem_map[mem.device_pointer] = pmem;
2191 
2192  mem.device_size = mem.memory_size();
2193  stats.mem_alloc(mem.device_size);
2194 
2195  return;
2196  }
2197  else {
2198  /* failed to register buffer, fallback to no interop */
2199  glDeleteBuffers(1, &pmem.cuPBO);
2200  glDeleteTextures(1, &pmem.cuTexId);
2201 
2202  background = true;
2203  }
2204 }
2205 
2206 void CUDADevice::pixels_copy_from(device_memory &mem, int y, int w, int h)
2207 {
2208  PixelMem pmem = pixel_mem_map[mem.device_pointer];
2209 
2210  CUDAContextScope scope(this);
2211 
2212  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
2213  uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
2214  size_t offset = sizeof(uchar) * 4 * y * w;
2215  memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
2216  glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
2217  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
2218 }
2219 
2220 void CUDADevice::pixels_free(device_memory &mem)
2221 {
2222  if (mem.device_pointer) {
2223  PixelMem pmem = pixel_mem_map[mem.device_pointer];
2224 
2225  CUDAContextScope scope(this);
2226 
2227  cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
2228  glDeleteBuffers(1, &pmem.cuPBO);
2229  glDeleteTextures(1, &pmem.cuTexId);
2230 
2231  pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
2232  mem.device_pointer = 0;
2233 
2234  stats.mem_free(mem.device_size);
2235  mem.device_size = 0;
2236  }
2237 }
2238 
2239 void CUDADevice::draw_pixels(device_memory &mem,
2240  int y,
2241  int w,
2242  int h,
2243  int width,
2244  int height,
2245  int dx,
2246  int dy,
2247  int dw,
2248  int dh,
2249  bool transparent,
2250  const DeviceDrawParams &draw_params)
2251 {
2252  assert(mem.type == MEM_PIXELS);
2253 
2254  if (!background) {
2255  const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
2256  PixelMem pmem = pixel_mem_map[mem.device_pointer];
2257  float *vpointer;
2258 
2259  CUDAContextScope scope(this);
2260 
2261  /* For multi devices this is inefficient: we allocate all pixels on the
2262  * device even though we only render to a subset. */
2263  size_t offset = 4 * y * w;
2264 
2265  if (mem.data_type == TYPE_HALF)
2266  offset *= sizeof(GLhalf);
2267  else
2268  offset *= sizeof(uint8_t);
2269 
2270  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
2271  glActiveTexture(GL_TEXTURE0);
2272  glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
2273  if (mem.data_type == TYPE_HALF) {
2274  glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
2275  }
2276  else {
2277  glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
2278  }
2279  glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
2280 
2281  if (transparent) {
2282  glEnable(GL_BLEND);
2283  glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
2284  }
2285 
2286  GLint shader_program;
2287  if (use_fallback_shader) {
2288  if (!bind_fallback_display_space_shader(dw, dh)) {
2289  return;
2290  }
2291  shader_program = fallback_shader_program;
2292  }
2293  else {
2294  draw_params.bind_display_space_shader_cb();
2295  glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
2296  }
2297 
2298  if (!vertex_buffer) {
2299  glGenBuffers(1, &vertex_buffer);
2300  }
2301 
2302  glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
2303  /* invalidate old contents -
2304  * avoids stalling if buffer is still waiting in queue to be rendered */
2305  glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
2306 
2307  vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
2308 
2309  if (vpointer) {
2310  /* texture coordinate - vertex pair */
2311  vpointer[0] = 0.0f;
2312  vpointer[1] = 0.0f;
2313  vpointer[2] = dx;
2314  vpointer[3] = dy;
2315 
2316  vpointer[4] = (float)w / (float)pmem.w;
2317  vpointer[5] = 0.0f;
2318  vpointer[6] = (float)width + dx;
2319  vpointer[7] = dy;
2320 
2321  vpointer[8] = (float)w / (float)pmem.w;
2322  vpointer[9] = (float)h / (float)pmem.h;
2323  vpointer[10] = (float)width + dx;
2324  vpointer[11] = (float)height + dy;
2325 
2326  vpointer[12] = 0.0f;
2327  vpointer[13] = (float)h / (float)pmem.h;
2328  vpointer[14] = dx;
2329  vpointer[15] = (float)height + dy;
2330 
2331  glUnmapBuffer(GL_ARRAY_BUFFER);
2332  }
2333 
2334  GLuint vertex_array_object;
2335  GLuint position_attribute, texcoord_attribute;
2336 
2337  glGenVertexArrays(1, &vertex_array_object);
2338  glBindVertexArray(vertex_array_object);
2339 
2340  texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
2341  position_attribute = glGetAttribLocation(shader_program, "pos");
2342 
2343  glEnableVertexAttribArray(texcoord_attribute);
2344  glEnableVertexAttribArray(position_attribute);
2345 
2346  glVertexAttribPointer(
2347  texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
2348  glVertexAttribPointer(position_attribute,
2349  2,
2350  GL_FLOAT,
2351  GL_FALSE,
2352  4 * sizeof(float),
2353  (const GLvoid *)(sizeof(float) * 2));
2354 
2355  glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
2356 
2357  if (use_fallback_shader) {
2358  glUseProgram(0);
2359  }
2360  else {
2361  draw_params.unbind_display_space_shader_cb();
2362  }
2363 
2364  if (transparent) {
2365  glDisable(GL_BLEND);
2366  }
2367 
2368  glBindTexture(GL_TEXTURE_2D, 0);
2369 
2370  return;
2371  }
2372 
2373  Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
2374 }
2375 
2376 void CUDADevice::thread_run(DeviceTask &task)
2377 {
2378  CUDAContextScope scope(this);
2379 
2380  if (task.type == DeviceTask::RENDER) {
2381  DeviceRequestedFeatures requested_features;
2382  if (use_split_kernel()) {
2383  if (split_kernel == NULL) {
2384  split_kernel = new CUDASplitKernel(this);
2385  split_kernel->load_kernels(requested_features);
2386  }
2387  }
2388 
2389  device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
2390 
2391  /* keep rendering tiles until done */
2392  RenderTile tile;
2393  DenoisingTask denoising(this, task);
2394 
2395  while (task.acquire_tile(this, tile, task.tile_types)) {
2396  if (tile.task == RenderTile::PATH_TRACE) {
2397  if (use_split_kernel()) {
2398  device_only_memory<uchar> void_buffer(this, "void_buffer");
2399  split_kernel->path_trace(task, tile, void_buffer, void_buffer);
2400  }
2401  else {
2402  render(task, tile, work_tiles);
2403  }
2404  }
2405  else if (tile.task == RenderTile::BAKE) {
2406  render(task, tile, work_tiles);
2407  }
2408  else if (tile.task == RenderTile::DENOISE) {
2409  tile.sample = tile.start_sample + tile.num_samples;
2410 
2411  denoise(tile, denoising);
2412 
2413  task.update_progress(&tile, tile.w * tile.h);
2414  }
2415 
2416  task.release_tile(tile);
2417 
2418  if (task.get_cancel()) {
2419  if (task.need_finish_queue == false)
2420  break;
2421  }
2422  }
2423 
2424  work_tiles.free();
2425  }
2426  else if (task.type == DeviceTask::SHADER) {
2427  shader(task);
2428 
2429  cuda_assert(cuCtxSynchronize());
2430  }
2431  else if (task.type == DeviceTask::DENOISE_BUFFER) {
2432  RenderTile tile;
2433  tile.x = task.x;
2434  tile.y = task.y;
2435  tile.w = task.w;
2436  tile.h = task.h;
2437  tile.buffer = task.buffer;
2438  tile.sample = task.sample + task.num_samples;
2439  tile.num_samples = task.num_samples;
2440  tile.start_sample = task.sample;
2441  tile.offset = task.offset;
2442  tile.stride = task.stride;
2443  tile.buffers = task.buffers;
2444 
2445  DenoisingTask denoising(this, task);
2446  denoise(tile, denoising);
2447  task.update_progress(&tile, tile.w * tile.h);
2448  }
2449 }
2450 
2451 void CUDADevice::task_add(DeviceTask &task)
2452 {
2453  CUDAContextScope scope(this);
2454 
2455  /* Load texture info. */
2456  load_texture_info();
2457 
2458  /* Synchronize all memory copies before executing task. */
2459  cuda_assert(cuCtxSynchronize());
2460 
2461  if (task.type == DeviceTask::FILM_CONVERT) {
2462  /* must be done in main thread due to opengl access */
2463  film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
2464  }
2465  else {
2466  task_pool.push([=] {
2467  DeviceTask task_copy = task;
2468  thread_run(task_copy);
2469  });
2470  }
2471 }
2472 
2473 void CUDADevice::task_wait()
2474 {
2475  task_pool.wait();
2476 }
2477 
2478 void CUDADevice::task_cancel()
2479 {
2480  task_pool.cancel();
2481 }
2482 
2483 /* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
2484  * now that the definition of that class is complete
2485  */
2486 # undef cuda_assert
2487 # define cuda_assert(stmt) \
2488  { \
2489  CUresult result = stmt; \
2490  if (result != CUDA_SUCCESS) { \
2491  const char *name = cuewErrorString(result); \
2492  device->set_error( \
2493  string_printf("%s in %s (device_cuda_impl.cpp:%d)", name, #stmt, __LINE__)); \
2494  } \
2495  } \
2496  (void)0
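/* Editorial note: the braces above scope `result` to each expansion, and the
 * trailing (void)0 leaves an expression dangling so the call site must supply
 * the terminating semicolon, letting cuda_assert(stmt); read like a normal
 * statement. */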
2497 
2498 /* CUDA context scope. */
2499 
2500 CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
2501 {
2502  cuda_assert(cuCtxPushCurrent(device->cuContext));
2503 }
2504 
2505 CUDAContextScope::~CUDAContextScope()
2506 {
2507  cuda_assert(cuCtxPopCurrent(NULL));
2508 }
2509 
2510 /* split kernel */
2511 
2512 class CUDASplitKernelFunction : public SplitKernelFunction {
2513  CUDADevice *device;
2514  CUfunction func;
2515 
2516  public:
2517  CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
2518  {
2519  }
2520 
2521  /* enqueue the kernel, returns false if there is an error */
2522  bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
2523  {
2524  return enqueue(dim, NULL);
2525  }
2526 
2527  /* enqueue the kernel, returns false if there is an error */
2528  bool enqueue(const KernelDimensions &dim, void *args[])
2529  {
2530  if (device->have_error())
2531  return false;
2532 
2533  CUDAContextScope scope(device);
2534 
2535  /* we ignore dim.local_size for now, as this is faster */
2536  int threads_per_block;
2537  cuda_assert(
2538  cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
2539 
2540  int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
2541  threads_per_block;
2542 
2543  cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
2544 
2545  cuda_assert(cuLaunchKernel(func,
2546  xblocks,
2547  1,
2548  1, /* blocks */
2549  threads_per_block,
2550  1,
2551  1, /* threads */
2552  0,
2553  0,
2554  args,
2555  0));
2556 
2557  return !device->have_error();
2558  }
2559 };
2560 
2561 CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
2562 {
2563 }
2564 
2565 uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
2566  device_memory & /*data*/,
2567  size_t num_threads)
2568 {
2569  CUDAContextScope scope(device);
2570 
2571  device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
2572  size_buffer.alloc(1);
2573  size_buffer.zero_to_device();
2574 
2575  uint threads = num_threads;
2576  CUdeviceptr d_size = (CUdeviceptr)size_buffer.device_pointer;
2577 
2578  struct args_t {
2579  uint *num_threads;
2580  CUdeviceptr *size;
2581  };
2582 
2583  args_t args = {&threads, &d_size};
2584 
2585  CUfunction state_buffer_size;
2586  cuda_assert(
2587  cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
2588 
2589  cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
2590 
2591  size_buffer.copy_from_device(0, 1, 1);
2592  size_t size = size_buffer[0];
2593  size_buffer.free();
2594 
2595  return size;
2596 }
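
/* Editorial sketch: state_buffer_size() above sizes the split-kernel state by
 * launching a single-thread kernel that writes the required byte count into a
 * 1-element buffer, since device-only struct layouts cannot be measured on
 * the host. A hypothetical device-side counterpart (CUDA, not part of this
 * file) would look like: */
#if 0 /* illustration only */
extern "C" __global__ void kernel_state_buffer_size_example(unsigned int num_threads,
                                                            unsigned long long *size)
{
  /* Hypothetical 64 bytes of path state per thread. */
  *size = (unsigned long long)num_threads * 64ull;
}
#endif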
2597 
2598 bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
2599  RenderTile &rtile,
2600  int num_global_elements,
2601  device_memory & /*kernel_globals*/,
2602  device_memory & /*kernel_data*/,
2603  device_memory &split_data,
2604  device_memory &ray_state,
2605  device_memory &queue_index,
2606  device_memory &use_queues_flag,
2607  device_memory &work_pool_wgs)
2608 {
2609  CUDAContextScope scope(device);
2610 
2611  CUdeviceptr d_split_data = (CUdeviceptr)split_data.device_pointer;
2612  CUdeviceptr d_ray_state = (CUdeviceptr)ray_state.device_pointer;
2613  CUdeviceptr d_queue_index = (CUdeviceptr)queue_index.device_pointer;
2614  CUdeviceptr d_use_queues_flag = (CUdeviceptr)use_queues_flag.device_pointer;
2615  CUdeviceptr d_work_pool_wgs = (CUdeviceptr)work_pool_wgs.device_pointer;
2616 
2617  CUdeviceptr d_buffer = (CUdeviceptr)rtile.buffer;
2618 
2619  int end_sample = rtile.start_sample + rtile.num_samples;
2620  int queue_size = dim.global_size[0] * dim.global_size[1];
2621 
2622  struct args_t {
2623  CUdeviceptr *split_data_buffer;
2624  int *num_elements;
2625  CUdeviceptr *ray_state;
2626  int *start_sample;
2627  int *end_sample;
2628  int *sx;
2629  int *sy;
2630  int *sw;
2631  int *sh;
2632  int *offset;
2633  int *stride;
2634  CUdeviceptr *queue_index;
2635  int *queuesize;
2636  CUdeviceptr *use_queues_flag;
2637  CUdeviceptr *work_pool_wgs;
2638  int *num_samples;
2639  CUdeviceptr *buffer;
2640  };
2641 
2642  args_t args = {&d_split_data,
2643  &num_global_elements,
2644  &d_ray_state,
2645  &rtile.start_sample,
2646  &end_sample,
2647  &rtile.x,
2648  &rtile.y,
2649  &rtile.w,
2650  &rtile.h,
2651  &rtile.offset,
2652  &rtile.stride,
2653  &d_queue_index,
2654  &queue_size,
2655  &d_use_queues_flag,
2656  &d_work_pool_wgs,
2657  &rtile.num_samples,
2658  &d_buffer};
2659 
2660  CUfunction data_init;
2661  cuda_assert(
2662  cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
2663  if (device->have_error()) {
2664  return false;
2665  }
2666 
2667  CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
2668 
2669  return !device->have_error();
2670 }
2671 
2672 SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
2673  const DeviceRequestedFeatures &)
2674 {
2675  const CUDAContextScope scope(device);
2676 
2677  CUfunction func;
2678  const CUresult result = cuModuleGetFunction(
2679  &func, device->cuModule, (string("kernel_cuda_") + kernel_name).data());
2680  if (result != CUDA_SUCCESS) {
2681  device->set_error(string_printf("Could not find kernel \"kernel_cuda_%s\" in module (%s)",
2682  kernel_name.data(),
2683  cuewErrorString(result)));
2684  return NULL;
2685  }
2686 
2687  return new CUDASplitKernelFunction(device, func);
2688 }
2689 
2690 int2 CUDASplitKernel::split_kernel_local_size()
2691 {
2692  return make_int2(32, 1);
2693 }
2694 
2695 int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
2696  device_memory &data,
2697  DeviceTask & /*task*/)
2698 {
2699  CUDAContextScope scope(device);
2700  size_t free;
2701  size_t total;
2702 
2703  cuda_assert(cuMemGetInfo(&free, &total));
2704 
2705  VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
2706  << " bytes. (" << string_human_readable_size(free) << ").";
2707 
2708  size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
2709  size_t side = round_down((int)sqrt(num_elements), 32);
2710  int2 global_size = make_int2(side, round_down(num_elements / side, 16));
2711  VLOG(1) << "Global size: " << global_size << ".";
2712  return global_size;
2713 }
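
/* Editorial note (hypothetical numbers): if half the free memory admits
 * num_elements = 2,000,000 split-kernel threads, then
 * side = round_down((int)sqrt(2000000), 32) = round_down(1414, 32) = 1408
 * and global_size = make_int2(1408, round_down(2000000 / 1408, 16))
 *                 = make_int2(1408, 1408). */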
2714 
2715 CCL_NAMESPACE_END
2716 
2717 #endif