Blender  V2.93
device_opencl_impl.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifdef WITH_OPENCL
18 
20 
21 # include "kernel/kernel_types.h"
23 
24 # include "util/util_algorithm.h"
25 # include "util/util_debug.h"
26 # include "util/util_foreach.h"
27 # include "util/util_logging.h"
28 # include "util/util_md5.h"
29 # include "util/util_path.h"
30 # include "util/util_time.h"
31 
33 
34 struct texture_slot_t {
35  texture_slot_t(const string &name, int slot) : name(name), slot(slot)
36  {
37  }
38  string name;
39  int slot;
40 };
41 
42 static const string NON_SPLIT_KERNELS =
43  "denoising "
44  "base "
45  "background "
46  "displace ";
47 
48 static const string SPLIT_BUNDLE_KERNELS =
49  "data_init "
50  "path_init "
51  "state_buffer_size "
52  "scene_intersect "
53  "queue_enqueue "
54  "shader_setup "
55  "shader_sort "
56  "enqueue_inactive "
57  "next_iteration_setup "
58  "indirect_subsurface "
59  "buffer_update "
60  "adaptive_stopping "
61  "adaptive_filter_x "
62  "adaptive_filter_y "
63  "adaptive_adjust_samples";
64 
65 const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
66 {
67  if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
68  return kernel_name;
69  }
70  else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
71  return "split_bundle";
72  }
73  else {
74  return "split_" + kernel_name;
75  }
76 }
77 
78 const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name)
79 {
80  if (kernel_name == "denoising") {
81  return "filter.cl";
82  }
83  else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
84  return "kernel_split_bundle.cl";
85  }
86  else {
87  return "kernel_" + kernel_name + ".cl";
88  }
89 }
90 
91 /* Enable features that we always want to compile to reduce recompilation events */
92 void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features)
93 {
94  features.use_transparent = true;
95  features.use_shadow_tricks = true;
96  features.use_principled = true;
97  features.use_denoising = true;
98 
99  if (!background) {
101  features.nodes_features = NODE_FEATURE_ALL;
102  features.use_hair = true;
103  features.use_subsurface = true;
104  features.use_camera_motion = false;
105  features.use_object_motion = false;
106  }
107 }
108 
109 string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features,
110  const string &opencl_program_name)
111 {
112  /* first check for non-split kernel programs */
113  if (opencl_program_name == "base" || opencl_program_name == "denoising") {
114  return "";
115  }
116  else if (opencl_program_name == "bake") {
117  /* Note: get_build_options for bake is only requested when baking is enabled.
118  * displace and background are always requested.
119  * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
120  DeviceRequestedFeatures features(requested_features);
121  enable_default_features(features);
122  features.use_denoising = false;
123  features.use_object_motion = false;
124  features.use_camera_motion = false;
125  features.use_hair = true;
126  features.use_subsurface = true;
128  features.nodes_features = NODE_FEATURE_ALL;
129  features.use_integrator_branched = false;
130  return features.get_build_options();
131  }
132  else if (opencl_program_name == "displace") {
133  /* As displacement does not use any nodes from the Shading group (eg BSDF).
134  * We disable all features that are related to shading. */
135  DeviceRequestedFeatures features(requested_features);
136  enable_default_features(features);
137  features.use_denoising = false;
138  features.use_object_motion = false;
139  features.use_camera_motion = false;
140  features.use_baking = false;
141  features.use_transparent = false;
142  features.use_shadow_tricks = false;
143  features.use_subsurface = false;
144  features.use_volume = false;
145  features.nodes_features &= ~NODE_FEATURE_VOLUME;
146  features.use_denoising = false;
147  features.use_principled = false;
148  features.use_integrator_branched = false;
149  return features.get_build_options();
150  }
151  else if (opencl_program_name == "background") {
152  /* Background uses Background shading
153  * It is save to disable shadow features, subsurface and volumetric. */
154  DeviceRequestedFeatures features(requested_features);
155  enable_default_features(features);
156  features.use_baking = false;
157  features.use_object_motion = false;
158  features.use_camera_motion = false;
159  features.use_transparent = false;
160  features.use_shadow_tricks = false;
161  features.use_denoising = false;
162  /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node.
163  * Perhaps we should remove them in UI as it does not make any sense when
164  * rendering background. */
165  features.nodes_features &= ~NODE_FEATURE_VOLUME;
166  features.use_subsurface = false;
167  features.use_volume = false;
168  features.use_shader_raytrace = false;
169  features.use_patch_evaluation = false;
170  features.use_integrator_branched = false;
171  return features.get_build_options();
172  }
173 
174  string build_options = "-D__SPLIT_KERNEL__ ";
175  /* Set compute device build option. */
176  cl_device_type device_type;
177  OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
178  assert(this->ciErr == CL_SUCCESS);
179  if (device_type == CL_DEVICE_TYPE_GPU) {
180  build_options += "-D__COMPUTE_DEVICE_GPU__ ";
181  }
182 
183  DeviceRequestedFeatures nofeatures;
184  enable_default_features(nofeatures);
185 
186  /* Add program specific optimized compile directives */
187  if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
188  build_options += nofeatures.get_build_options();
189  }
190  else {
191  DeviceRequestedFeatures features(requested_features);
192  enable_default_features(features);
193 
194  /* Always turn off baking at this point. Baking is only useful when building the bake kernel.
195  * this also makes sure that the kernels that are build during baking can be reused
196  * when not doing any baking. */
197  features.use_baking = false;
198 
199  /* Do not vary on shaders when program doesn't do any shading.
200  * We have bundled them in a single program. */
201  if (opencl_program_name == "split_bundle") {
202  features.max_nodes_group = 0;
203  features.nodes_features = 0;
204  features.use_shader_raytrace = false;
205  }
206 
207  /* No specific settings, just add the regular ones */
208  build_options += features.get_build_options();
209  }
210 
211  return build_options;
212 }
213 
214 OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
215 {
216  device = device_;
217 }
218 
219 OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
220 {
221  program_split.release();
222  program_lamp_emission.release();
223  program_do_volume.release();
224  program_indirect_background.release();
225  program_shader_eval.release();
226  program_holdout_emission_blurring_pathtermination_ao.release();
227  program_subsurface_scatter.release();
228  program_direct_lighting.release();
229  program_shadow_blocked_ao.release();
230  program_shadow_blocked_dl.release();
231 }
232 
233 void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
234  vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features)
235 {
236  if (!requested_features.use_baking) {
237 # define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \
238  program_split.add_kernel(ustring("path_trace_" #kernel_name));
239 # define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
240  const string program_name_##kernel_name = "split_" #kernel_name; \
241  program_##kernel_name = OpenCLDevice::OpenCLProgram( \
242  device, \
243  program_name_##kernel_name, \
244  "kernel_" #kernel_name ".cl", \
245  device->get_build_options(requested_features, program_name_##kernel_name)); \
246  program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \
247  programs.push_back(&program_##kernel_name);
248 
249  /* Ordered with most complex kernels first, to reduce overall compile time. */
250  ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
251  ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
252  ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
253  if (requested_features.use_volume) {
254  ADD_SPLIT_KERNEL_PROGRAM(do_volume);
255  }
256  ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
257  ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
258  ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
259  ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
260  ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
261 
262  /* Quick kernels bundled in a single program to reduce overhead of starting
263  * Blender processes. */
264  program_split = OpenCLDevice::OpenCLProgram(
265  device,
266  "split_bundle",
267  "kernel_split_bundle.cl",
268  device->get_build_options(requested_features, "split_bundle"));
269 
270  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
271  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
272  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
273  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
274  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
275  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
276  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
277  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
278  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
279  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
280  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
281  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
282  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
283  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
284  ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
285  programs.push_back(&program_split);
286 
287 # undef ADD_SPLIT_KERNEL_PROGRAM
288 # undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
289  }
290 }
291 
292 namespace {
293 
294 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
295  * fetch its size.
296  */
297 typedef struct KernelGlobalsDummy {
299  ccl_global char *buffers[8];
300 
301 # define KERNEL_TEX(type, name) TextureInfo name;
302 # include "kernel/kernel_textures.h"
303 # undef KERNEL_TEX
304  SplitData split_data;
305  SplitParams split_param_data;
306 } KernelGlobalsDummy;
307 
308 } // namespace
309 
310 struct CachedSplitMemory {
311  int id;
312  device_memory *split_data;
318 };
319 
320 class OpenCLSplitKernelFunction : public SplitKernelFunction {
321  public:
322  OpenCLDevice *device;
323  OpenCLDevice::OpenCLProgram program;
324  CachedSplitMemory &cached_memory;
325  int cached_id;
326 
327  OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory)
328  : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1)
329  {
330  }
331 
332  ~OpenCLSplitKernelFunction()
333  {
334  program.release();
335  }
336 
337  virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)
338  {
339  if (cached_id != cached_memory.id) {
340  cl_uint start_arg_index = device->kernel_set_args(
341  program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state);
342 
343  device->set_kernel_arg_buffers(program(), &start_arg_index);
344 
345  start_arg_index += device->kernel_set_args(program(),
346  start_arg_index,
347  *cached_memory.queue_index,
348  *cached_memory.use_queues_flag,
349  *cached_memory.work_pools,
350  *cached_memory.buffer);
351 
352  cached_id = cached_memory.id;
353  }
354 
355  device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
356  program(),
357  2,
358  NULL,
359  dim.global_size,
360  dim.local_size,
361  0,
362  NULL,
363  NULL);
364 
365  device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
366 
367  if (device->ciErr != CL_SUCCESS) {
368  string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
369  clewErrorString(device->ciErr));
370  device->opencl_error(message);
371  return false;
372  }
373 
374  return true;
375  }
376 };
377 
378 class OpenCLSplitKernel : public DeviceSplitKernel {
379  OpenCLDevice *device;
380  CachedSplitMemory cached_memory;
381 
382  public:
383  explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device)
384  {
385  }
386 
388  const string &kernel_name, const DeviceRequestedFeatures &requested_features)
389  {
390  OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory);
391 
392  const string program_name = device->get_opencl_program_name(kernel_name);
393  kernel->program = OpenCLDevice::OpenCLProgram(
394  device,
395  program_name,
396  device->get_opencl_program_filename(kernel_name),
397  device->get_build_options(requested_features, program_name));
398 
399  kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
400  kernel->program.load();
401 
402  if (!kernel->program.is_loaded()) {
403  delete kernel;
404  return NULL;
405  }
406 
407  return kernel;
408  }
409 
410  virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
411  {
412  device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
413  size_buffer.alloc(1);
414  size_buffer.zero_to_device();
415 
416  uint threads = num_threads;
417  OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
418  cl_kernel kernel_state_buffer_size = programs->program_split(
419  ustring("path_trace_state_buffer_size"));
420  device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
421 
422  size_t global_size = 64;
423  device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
424  kernel_state_buffer_size,
425  1,
426  NULL,
427  &global_size,
428  NULL,
429  0,
430  NULL,
431  NULL);
432 
433  device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
434 
435  size_buffer.copy_from_device(0, 1, 1);
436  size_t size = size_buffer[0];
437  size_buffer.free();
438 
439  if (device->ciErr != CL_SUCCESS) {
440  string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
441  clewErrorString(device->ciErr));
442  device->opencl_error(message);
443  return 0;
444  }
445 
446  return size;
447  }
448 
449  virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
450  RenderTile &rtile,
451  int num_global_elements,
452  device_memory &kernel_globals,
454  device_memory &split_data,
458  device_memory &work_pool_wgs)
459  {
460  cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
461 
462  /* Set the range of samples to be processed for every ray in
463  * path-regeneration logic.
464  */
465  cl_int start_sample = rtile.start_sample;
466  cl_int end_sample = rtile.start_sample + rtile.num_samples;
467 
468  OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
469  cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
470 
471  cl_uint start_arg_index = device->kernel_set_args(kernel_data_init,
472  0,
473  kernel_globals,
474  kernel_data,
475  split_data,
476  num_global_elements,
477  ray_state);
478 
479  device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
480 
481  start_arg_index += device->kernel_set_args(kernel_data_init,
482  start_arg_index,
483  start_sample,
484  end_sample,
485  rtile.x,
486  rtile.y,
487  rtile.w,
488  rtile.h,
489  rtile.offset,
490  rtile.stride,
491  queue_index,
492  dQueue_size,
494  work_pool_wgs,
495  rtile.num_samples,
496  rtile.buffer);
497 
498  /* Enqueue ckPathTraceKernel_data_init kernel. */
499  device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
501  2,
502  NULL,
503  dim.global_size,
504  dim.local_size,
505  0,
506  NULL,
507  NULL);
508 
509  device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
510 
511  if (device->ciErr != CL_SUCCESS) {
512  string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
513  clewErrorString(device->ciErr));
514  device->opencl_error(message);
515  return false;
516  }
517 
518  cached_memory.split_data = &split_data;
519  cached_memory.ray_state = &ray_state;
520  cached_memory.queue_index = &queue_index;
521  cached_memory.use_queues_flag = &use_queues_flag;
522  cached_memory.work_pools = &work_pool_wgs;
523  cached_memory.buffer = &rtile.buffer;
524  cached_memory.id++;
525 
526  return true;
527  }
528 
529  virtual int2 split_kernel_local_size()
530  {
531  return make_int2(64, 1);
532  }
533 
536  DeviceTask & /*task*/)
537  {
538  cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
539  /* Use small global size on CPU devices as it seems to be much faster. */
540  if (type == CL_DEVICE_TYPE_CPU) {
541  VLOG(1) << "Global size: (64, 64).";
542  return make_int2(64, 64);
543  }
544 
545  cl_ulong max_buffer_size;
546  clGetDeviceInfo(
547  device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
548 
549  if (DebugFlags().opencl.mem_limit) {
550  max_buffer_size = min(max_buffer_size,
551  cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
552  }
553 
554  VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
555  << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";
556 
557  /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
558  max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);
559 
560  size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
561  int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
562  (int)sqrt(num_elements));
563 
564  if (device->info.description.find("Intel") != string::npos) {
565  global_size = make_int2(min(512, global_size.x), min(512, global_size.y));
566  }
567 
568  VLOG(1) << "Global size: " << global_size << ".";
569  return global_size;
570  }
571 };
572 
573 bool OpenCLDevice::opencl_error(cl_int err)
574 {
575  if (err != CL_SUCCESS) {
576  string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
577  if (error_msg == "")
578  error_msg = message;
579  fprintf(stderr, "%s\n", message.c_str());
580  return true;
581  }
582 
583  return false;
584 }
585 
586 void OpenCLDevice::opencl_error(const string &message)
587 {
588  if (error_msg == "")
589  error_msg = message;
590  fprintf(stderr, "%s\n", message.c_str());
591 }
592 
593 void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
594 {
595  if (err != CL_SUCCESS) {
596  string message = string_printf(
597  "OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
598  if (error_msg == "")
599  error_msg = message;
600  fprintf(stderr, "%s\n", message.c_str());
601 # ifndef NDEBUG
602  abort();
603 # endif
604  }
605 }
606 
607 OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
608  : Device(info, stats, profiler, background),
609  load_kernel_num_compiling(0),
610  kernel_programs(this),
611  memory_manager(this),
612  texture_info(this, "__texture_info", MEM_GLOBAL)
613 {
614  cpPlatform = NULL;
615  cdDevice = NULL;
616  cxContext = NULL;
617  cqCommandQueue = NULL;
618  device_initialized = false;
619  textures_need_update = true;
620 
621  vector<OpenCLPlatformDevice> usable_devices;
622  OpenCLInfo::get_usable_devices(&usable_devices);
623  if (usable_devices.size() == 0) {
624  opencl_error("OpenCL: no devices found.");
625  return;
626  }
627  assert(info.num < usable_devices.size());
628  OpenCLPlatformDevice &platform_device = usable_devices[info.num];
629  device_num = info.num;
630  cpPlatform = platform_device.platform_id;
631  cdDevice = platform_device.device_id;
632  platform_name = platform_device.platform_name;
633  device_name = platform_device.device_name;
634  VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device "
635  << device_name << ".";
636 
637  {
638  /* try to use cached context */
639  thread_scoped_lock cache_locker;
640  cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
641 
642  if (cxContext == NULL) {
643  /* create context properties array to specify platform */
644  const cl_context_properties context_props[] = {
645  CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0};
646 
647  /* create context */
648  cxContext = clCreateContext(
649  context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr);
650 
651  if (opencl_error(ciErr)) {
652  opencl_error("OpenCL: clCreateContext failed");
653  return;
654  }
655 
656  /* cache it */
657  OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
658  }
659  }
660 
661  cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
662  if (opencl_error(ciErr)) {
663  opencl_error("OpenCL: Error creating command queue");
664  return;
665  }
666 
667  /* Allocate this right away so that texture_info
668  * is placed at offset 0 in the device memory buffers. */
669  texture_info.resize(1);
670  memory_manager.alloc("texture_info", texture_info);
671 
672  device_initialized = true;
673 
674  split_kernel = new OpenCLSplitKernel(this);
675 }
676 
677 OpenCLDevice::~OpenCLDevice()
678 {
679  task_pool.cancel();
680  load_required_kernel_task_pool.cancel();
681  load_kernel_task_pool.cancel();
682 
683  memory_manager.free();
684 
685  ConstMemMap::iterator mt;
686  for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
687  delete mt->second;
688  }
689 
690  base_program.release();
691  bake_program.release();
692  displace_program.release();
693  background_program.release();
694  denoising_program.release();
695 
696  if (cqCommandQueue)
697  clReleaseCommandQueue(cqCommandQueue);
698  if (cxContext)
699  clReleaseContext(cxContext);
700 
701  delete split_kernel;
702 }
703 
704 void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
705  const void * /*private_info*/,
706  size_t /*cb*/,
707  void *user_data)
708 {
709  string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
710  fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
711 }
712 
713 bool OpenCLDevice::opencl_version_check()
714 {
715  string error;
716  if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
717  opencl_error(error);
718  return false;
719  }
720  if (!OpenCLInfo::device_version_check(cdDevice, &error)) {
721  opencl_error(error);
722  return false;
723  }
724  return true;
725 }
726 
727 string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
728 {
729  MD5Hash md5;
730  char version[256], driver[256], name[256], vendor[256];
731 
732  clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
733  clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
734  clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
735  clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
736 
737  md5.append((uint8_t *)vendor, strlen(vendor));
738  md5.append((uint8_t *)version, strlen(version));
739  md5.append((uint8_t *)name, strlen(name));
740  md5.append((uint8_t *)driver, strlen(driver));
741 
742  string options = kernel_build_options();
743  options += kernel_custom_build_options;
744  md5.append((uint8_t *)options.c_str(), options.size());
745 
746  return md5.get_hex();
747 }
748 
749 bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features)
750 {
751  VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << ".";
752  /* Verify if device was initialized. */
753  if (!device_initialized) {
754  fprintf(stderr, "OpenCL: failed to initialize device.\n");
755  return false;
756  }
757 
758  /* Verify we have right opencl version. */
759  if (!opencl_version_check())
760  return false;
761 
762  load_required_kernels(requested_features);
763 
764  vector<OpenCLProgram *> programs;
765  kernel_programs.load_kernels(programs, requested_features);
766 
767  if (!requested_features.use_baking && requested_features.use_denoising) {
768  denoising_program = OpenCLProgram(
769  this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
770  denoising_program.add_kernel(ustring("filter_divide_shadow"));
771  denoising_program.add_kernel(ustring("filter_get_feature"));
772  denoising_program.add_kernel(ustring("filter_write_feature"));
773  denoising_program.add_kernel(ustring("filter_detect_outliers"));
774  denoising_program.add_kernel(ustring("filter_combine_halves"));
775  denoising_program.add_kernel(ustring("filter_construct_transform"));
776  denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
777  denoising_program.add_kernel(ustring("filter_nlm_blur"));
778  denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
779  denoising_program.add_kernel(ustring("filter_nlm_update_output"));
780  denoising_program.add_kernel(ustring("filter_nlm_normalize"));
781  denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
782  denoising_program.add_kernel(ustring("filter_finalize"));
783  programs.push_back(&denoising_program);
784  }
785 
786  load_required_kernel_task_pool.wait_work();
787 
788  /* Parallel compilation of Cycles kernels, this launches multiple
789  * processes to workaround OpenCL frameworks serializing the calls
790  * internally within a single process. */
791  foreach (OpenCLProgram *program, programs) {
792  if (!program->load()) {
793  load_kernel_num_compiling++;
794  load_kernel_task_pool.push([=] {
795  program->compile();
796  load_kernel_num_compiling--;
797  });
798  }
799  }
800  return true;
801 }
802 
803 void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features)
804 {
805  vector<OpenCLProgram *> programs;
806  base_program = OpenCLProgram(
807  this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
808  base_program.add_kernel(ustring("convert_to_byte"));
809  base_program.add_kernel(ustring("convert_to_half_float"));
810  base_program.add_kernel(ustring("zero_buffer"));
811  programs.push_back(&base_program);
812 
813  if (requested_features.use_true_displacement) {
814  displace_program = OpenCLProgram(
815  this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
816  displace_program.add_kernel(ustring("displace"));
817  programs.push_back(&displace_program);
818  }
819 
820  if (requested_features.use_background_light) {
821  background_program = OpenCLProgram(this,
822  "background",
823  "kernel_background.cl",
824  get_build_options(requested_features, "background"));
825  background_program.add_kernel(ustring("background"));
826  programs.push_back(&background_program);
827  }
828 
829  if (requested_features.use_baking) {
830  bake_program = OpenCLProgram(
831  this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
832  bake_program.add_kernel(ustring("bake"));
833  programs.push_back(&bake_program);
834  }
835 
836  foreach (OpenCLProgram *program, programs) {
837  if (!program->load()) {
838  load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
839  }
840  }
841 }
842 
843 bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features)
844 {
845  if (requested_features.use_baking) {
846  /* For baking, kernels have already been loaded in load_required_kernels(). */
847  return true;
848  }
849 
850  load_kernel_task_pool.wait_work();
851  return split_kernel->load_kernels(requested_features);
852 }
853 
854 OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs()
855 {
856  return &kernel_programs;
857 }
858 
859 DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
860 {
862 }
863 
864 void OpenCLDevice::mem_alloc(device_memory &mem)
865 {
866  if (mem.name) {
867  VLOG(1) << "Buffer allocate: " << mem.name << ", "
868  << string_human_readable_number(mem.memory_size()) << " bytes. ("
869  << string_human_readable_size(mem.memory_size()) << ")";
870  }
871 
872  size_t size = mem.memory_size();
873 
874  /* check there is enough memory available for the allocation */
875  cl_ulong max_alloc_size = 0;
876  clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
877 
878  if (DebugFlags().opencl.mem_limit) {
879  max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
880  }
881 
882  if (size > max_alloc_size) {
883  string error = "Scene too complex to fit in available memory.";
884  if (mem.name != NULL) {
885  error += string_printf(" (allocating buffer %s failed.)", mem.name);
886  }
887  set_error(error);
888 
889  return;
890  }
891 
892  cl_mem_flags mem_flag;
893  void *mem_ptr = NULL;
894 
895  if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
896  mem_flag = CL_MEM_READ_ONLY;
897  else
898  mem_flag = CL_MEM_READ_WRITE;
899 
900  /* Zero-size allocation might be invoked by render, but not really
901  * supported by OpenCL. Using NULL as device pointer also doesn't really
902  * work for some reason, so for the time being we'll use special case
903  * will null_mem buffer.
904  */
905  if (size != 0) {
906  mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
907  opencl_assert_err(ciErr, "clCreateBuffer");
908  }
909  else {
910  mem.device_pointer = 0;
911  }
912 
913  stats.mem_alloc(size);
914  mem.device_size = size;
915 }
916 
917 void OpenCLDevice::mem_copy_to(device_memory &mem)
918 {
919  if (mem.type == MEM_GLOBAL) {
920  global_free(mem);
921  global_alloc(mem);
922  }
923  else if (mem.type == MEM_TEXTURE) {
924  tex_free((device_texture &)mem);
925  tex_alloc((device_texture &)mem);
926  }
927  else {
928  if (!mem.device_pointer) {
929  mem_alloc(mem);
930  }
931 
932  /* this is blocking */
933  size_t size = mem.memory_size();
934  if (size != 0) {
935  opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
936  CL_MEM_PTR(mem.device_pointer),
937  CL_TRUE,
938  0,
939  size,
940  mem.host_pointer,
941  0,
942  NULL,
943  NULL));
944  }
945  }
946 }
947 
948 void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
949 {
950  size_t offset = elem * y * w;
951  size_t size = elem * w * h;
952  assert(size != 0);
953  opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
954  CL_MEM_PTR(mem.device_pointer),
955  CL_TRUE,
956  offset,
957  size,
958  (uchar *)mem.host_pointer + offset,
959  0,
960  NULL,
961  NULL));
962 }
963 
964 void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
965 {
966  base_program.wait_for_availability();
967  cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
968 
969  size_t global_size[] = {1024, 1024};
970  size_t num_threads = global_size[0] * global_size[1];
971 
972  cl_mem d_buffer = CL_MEM_PTR(mem);
973  cl_ulong d_offset = 0;
974  cl_ulong d_size = 0;
975 
976  while (d_offset < size) {
977  d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset);
978 
979  kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
980 
981  ciErr = clEnqueueNDRangeKernel(
982  cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL);
983  opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
984 
985  d_offset += d_size;
986  }
987 }
988 
989 void OpenCLDevice::mem_zero(device_memory &mem)
990 {
991  if (!mem.device_pointer) {
992  mem_alloc(mem);
993  }
994 
995  if (mem.device_pointer) {
996  if (base_program.is_loaded()) {
997  mem_zero_kernel(mem.device_pointer, mem.memory_size());
998  }
999 
1000  if (mem.host_pointer) {
1001  memset(mem.host_pointer, 0, mem.memory_size());
1002  }
1003 
1004  if (!base_program.is_loaded()) {
1005  void *zero = mem.host_pointer;
1006 
1007  if (!mem.host_pointer) {
1008  zero = util_aligned_malloc(mem.memory_size(), 16);
1009  memset(zero, 0, mem.memory_size());
1010  }
1011 
1012  opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
1013  CL_MEM_PTR(mem.device_pointer),
1014  CL_TRUE,
1015  0,
1016  mem.memory_size(),
1017  zero,
1018  0,
1019  NULL,
1020  NULL));
1021 
1022  if (!mem.host_pointer) {
1023  util_aligned_free(zero);
1024  }
1025  }
1026  }
1027 }
1028 
1029 void OpenCLDevice::mem_free(device_memory &mem)
1030 {
1031  if (mem.type == MEM_GLOBAL) {
1032  global_free(mem);
1033  }
1034  else if (mem.type == MEM_TEXTURE) {
1035  tex_free((device_texture &)mem);
1036  }
1037  else {
1038  if (mem.device_pointer) {
1039  if (mem.device_pointer != 0) {
1040  opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
1041  }
1042  mem.device_pointer = 0;
1043 
1044  stats.mem_free(mem.device_size);
1045  mem.device_size = 0;
1046  }
1047  }
1048 }
1049 
1050 int OpenCLDevice::mem_sub_ptr_alignment()
1051 {
1052  return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
1053 }
1054 
1055 device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
1056 {
1057  cl_mem_flags mem_flag;
1058  if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
1059  mem_flag = CL_MEM_READ_ONLY;
1060  else
1061  mem_flag = CL_MEM_READ_WRITE;
1062 
1063  cl_buffer_region info;
1064  info.origin = mem.memory_elements_size(offset);
1065  info.size = mem.memory_elements_size(size);
1066 
1067  device_ptr sub_buf = (device_ptr)clCreateSubBuffer(
1068  CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr);
1069  opencl_assert_err(ciErr, "clCreateSubBuffer");
1070  return sub_buf;
1071 }
1072 
1073 void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
1074 {
1075  if (device_pointer != 0) {
1076  opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
1077  }
1078 }
1079 
1080 void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
1081 {
1082  ConstMemMap::iterator i = const_mem_map.find(name);
1084 
1085  if (i == const_mem_map.end()) {
1086  data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
1087  data->alloc(size);
1088  const_mem_map.insert(ConstMemMap::value_type(name, data));
1089  }
1090  else {
1091  data = i->second;
1092  }
1093 
1094  memcpy(data->data(), host, size);
1095  data->copy_to_device();
1096 }
1097 
1098 void OpenCLDevice::global_alloc(device_memory &mem)
1099 {
1100  VLOG(1) << "Global memory allocate: " << mem.name << ", "
1101  << string_human_readable_number(mem.memory_size()) << " bytes. ("
1102  << string_human_readable_size(mem.memory_size()) << ")";
1103 
1104  memory_manager.alloc(mem.name, mem);
1105  /* Set the pointer to non-null to keep code that inspects its value from thinking its
1106  * unallocated. */
1107  mem.device_pointer = 1;
1108  textures[mem.name] = &mem;
1109  textures_need_update = true;
1110 }
1111 
1112 void OpenCLDevice::global_free(device_memory &mem)
1113 {
1114  if (mem.device_pointer) {
1115  mem.device_pointer = 0;
1116 
1117  if (memory_manager.free(mem)) {
1118  textures_need_update = true;
1119  }
1120 
1121  foreach (TexturesMap::value_type &value, textures) {
1122  if (value.second == &mem) {
1123  textures.erase(value.first);
1124  break;
1125  }
1126  }
1127  }
1128 }
1129 
1130 void OpenCLDevice::tex_alloc(device_texture &mem)
1131 {
1132  VLOG(1) << "Texture allocate: " << mem.name << ", "
1133  << string_human_readable_number(mem.memory_size()) << " bytes. ("
1134  << string_human_readable_size(mem.memory_size()) << ")";
1135 
1136  memory_manager.alloc(mem.name, mem);
1137  /* Set the pointer to non-null to keep code that inspects its value from thinking its
1138  * unallocated. */
1139  mem.device_pointer = 1;
1140  textures[mem.name] = &mem;
1141  textures_need_update = true;
1142 }
1143 
1144 void OpenCLDevice::tex_free(device_texture &mem)
1145 {
1146  global_free(mem);
1147 }
1148 
1149 size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
1150 {
1151  int r = global_size % group_size;
1152  return global_size + ((r == 0) ? 0 : group_size - r);
1153 }
1154 
1155 void OpenCLDevice::enqueue_kernel(
1156  cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
1157 {
1158  size_t workgroup_size, max_work_items[3];
1159 
1160  clGetKernelWorkGroupInfo(
1161  kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
1162  clGetDeviceInfo(
1163  cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL);
1164 
1165  if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
1166  workgroup_size = max_workgroup_size;
1167  }
1168 
1169  /* Try to divide evenly over 2 dimensions. */
1170  size_t local_size[2];
1171  if (x_workgroups) {
1172  local_size[0] = workgroup_size;
1173  local_size[1] = 1;
1174  }
1175  else {
1176  size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
1177  local_size[0] = local_size[1] = sqrt_workgroup_size;
1178  }
1179 
1180  /* Some implementations have max size 1 on 2nd dimension. */
1181  if (local_size[1] > max_work_items[1]) {
1182  local_size[0] = workgroup_size / max_work_items[1];
1183  local_size[1] = max_work_items[1];
1184  }
1185 
1186  size_t global_size[2] = {global_size_round_up(local_size[0], w),
1187  global_size_round_up(local_size[1], h)};
1188 
1189  /* Vertical size of 1 is coming from bake/shade kernels where we should
1190  * not round anything up because otherwise we'll either be doing too
1191  * much work per pixel (if we don't check global ID on Y axis) or will
1192  * be checking for global ID to always have Y of 0.
1193  */
1194  if (h == 1) {
1195  global_size[h] = 1;
1196  }
1197 
1198  /* run kernel */
1199  opencl_assert(
1200  clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
1201  opencl_assert(clFlush(cqCommandQueue));
1202 }
1203 
1204 void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
1205 {
1206  cl_mem ptr;
1207 
1208  MemMap::iterator i = mem_map.find(name);
1209  if (i != mem_map.end()) {
1210  ptr = CL_MEM_PTR(i->second);
1211  }
1212  else {
1213  ptr = 0;
1214  }
1215 
1216  opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
1217 }
1218 
1219 void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
1220 {
1221  flush_texture_buffers();
1222 
1223  memory_manager.set_kernel_arg_buffers(kernel, narg);
1224 }
1225 
1226 void OpenCLDevice::flush_texture_buffers()
1227 {
1228  if (!textures_need_update) {
1229  return;
1230  }
1231  textures_need_update = false;
1232 
1233  /* Setup slots for textures. */
1234  int num_slots = 0;
1235 
1236  vector<texture_slot_t> texture_slots;
1237 
1238 # define KERNEL_TEX(type, name) \
1239  if (textures.find(#name) != textures.end()) { \
1240  texture_slots.push_back(texture_slot_t(#name, num_slots)); \
1241  } \
1242  num_slots++;
1243 # include "kernel/kernel_textures.h"
1244 
1245  int num_data_slots = num_slots;
1246 
1247  foreach (TexturesMap::value_type &tex, textures) {
1248  string name = tex.first;
1249  device_memory *mem = tex.second;
1250 
1251  if (mem->type == MEM_TEXTURE) {
1252  const uint id = ((device_texture *)mem)->slot;
1253  texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
1254  num_slots = max(num_slots, num_data_slots + id + 1);
1255  }
1256  }
1257 
1258  /* Realloc texture descriptors buffer. */
1259  memory_manager.free(texture_info);
1260  texture_info.resize(num_slots);
1261  memory_manager.alloc("texture_info", texture_info);
1262 
1263  /* Fill in descriptors */
1264  foreach (texture_slot_t &slot, texture_slots) {
1265  device_memory *mem = textures[slot.name];
1266  TextureInfo &info = texture_info[slot.slot];
1267 
1268  MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
1269 
1270  if (mem->type == MEM_TEXTURE) {
1271  info = ((device_texture *)mem)->info;
1272  }
1273  else {
1274  memset(&info, 0, sizeof(TextureInfo));
1275  }
1276 
1277  info.data = desc.offset;
1278  info.cl_buffer = desc.device_buffer;
1279  }
1280 
1281  /* Force write of descriptors. */
1282  memory_manager.free(texture_info);
1283  memory_manager.alloc("texture_info", texture_info);
1284 }
1285 
1286 void OpenCLDevice::thread_run(DeviceTask &task)
1287 {
1288  flush_texture_buffers();
1289 
1290  if (task.type == DeviceTask::RENDER) {
1291  RenderTile tile;
1292  DenoisingTask denoising(this, task);
1293 
1294  /* Allocate buffer for kernel globals */
1295  device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
1296  kgbuffer.alloc_to_device(1);
1297 
1298  /* Keep rendering tiles until done. */
1299  while (task.acquire_tile(this, tile, task.tile_types)) {
1300  if (tile.task == RenderTile::PATH_TRACE) {
1301  assert(tile.task == RenderTile::PATH_TRACE);
1302  scoped_timer timer(&tile.buffers->render_time);
1303 
1304  split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]);
1305 
1306  /* Complete kernel execution before release tile. */
1307  /* This helps in multi-device render;
1308  * The device that reaches the critical-section function
1309  * release_tile waits (stalling other devices from entering
1310  * release_tile) for all kernels to complete. If device1 (a
1311  * slow-render device) reaches release_tile first then it would
1312  * stall device2 (a fast-render device) from proceeding to render
1313  * next tile.
1314  */
1315  clFinish(cqCommandQueue);
1316  }
1317  else if (tile.task == RenderTile::BAKE) {
1318  bake(task, tile);
1319  }
1320  else if (tile.task == RenderTile::DENOISE) {
1321  tile.sample = tile.start_sample + tile.num_samples;
1322  denoise(tile, denoising);
1323  task.update_progress(&tile, tile.w * tile.h);
1324  }
1325 
1326  task.release_tile(tile);
1327  }
1328 
1329  kgbuffer.free();
1330  }
1331  else if (task.type == DeviceTask::SHADER) {
1332  shader(task);
1333  }
1334  else if (task.type == DeviceTask::FILM_CONVERT) {
1335  film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
1336  }
1337  else if (task.type == DeviceTask::DENOISE_BUFFER) {
1338  RenderTile tile;
1339  tile.x = task.x;
1340  tile.y = task.y;
1341  tile.w = task.w;
1342  tile.h = task.h;
1343  tile.buffer = task.buffer;
1344  tile.sample = task.sample + task.num_samples;
1345  tile.num_samples = task.num_samples;
1346  tile.start_sample = task.sample;
1347  tile.offset = task.offset;
1348  tile.stride = task.stride;
1349  tile.buffers = task.buffers;
1350 
1351  DenoisingTask denoising(this, task);
1352  denoise(tile, denoising);
1353  task.update_progress(&tile, tile.w * tile.h);
1354  }
1355 }
1356 
1357 void OpenCLDevice::film_convert(DeviceTask &task,
1359  device_ptr rgba_byte,
1360  device_ptr rgba_half)
1361 {
1362  /* cast arguments to cl types */
1363  cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1364  cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half);
1365  cl_mem d_buffer = CL_MEM_PTR(buffer);
1366  cl_int d_x = task.x;
1367  cl_int d_y = task.y;
1368  cl_int d_w = task.w;
1369  cl_int d_h = task.h;
1370  cl_float d_sample_scale = 1.0f / (task.sample + 1);
1371  cl_int d_offset = task.offset;
1372  cl_int d_stride = task.stride;
1373 
1374  cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) :
1375  base_program(ustring("convert_to_half_float"));
1376 
1377  cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
1378 
1379  set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
1380 
1381  start_arg_index += kernel_set_args(ckFilmConvertKernel,
1382  start_arg_index,
1383  d_sample_scale,
1384  d_x,
1385  d_y,
1386  d_w,
1387  d_h,
1388  d_offset,
1389  d_stride);
1390 
1391  enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1392 }
1393 
1394 bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
1395  device_ptr guide_ptr,
1396  device_ptr variance_ptr,
1397  device_ptr out_ptr,
1399 {
1400  int stride = task->buffer.stride;
1401  int w = task->buffer.width;
1402  int h = task->buffer.h;
1403  int r = task->nlm_state.r;
1404  int f = task->nlm_state.f;
1405  float a = task->nlm_state.a;
1406  float k_2 = task->nlm_state.k_2;
1407 
1408  int pass_stride = task->buffer.pass_stride;
1409  int num_shifts = (2 * r + 1) * (2 * r + 1);
1410  int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
1411 
1412  device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
1413  device_sub_ptr blurDifference(
1414  task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
1415  device_sub_ptr weightAccum(
1416  task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride);
1417  cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
1418  cl_mem difference_mem = CL_MEM_PTR(*difference);
1419  cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
1420 
1421  cl_mem image_mem = CL_MEM_PTR(image_ptr);
1422  cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
1423  cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1424  cl_mem out_mem = CL_MEM_PTR(out_ptr);
1425  cl_mem scale_mem = NULL;
1426 
1427  mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride);
1428  mem_zero_kernel(out_ptr, sizeof(float) * pass_stride);
1429 
1430  cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
1431  cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
1432  cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
1433  cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
1434  cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
1435 
1436  kernel_set_args(ckNLMCalcDifference,
1437  0,
1438  guide_mem,
1439  variance_mem,
1440  scale_mem,
1441  difference_mem,
1442  w,
1443  h,
1444  stride,
1445  pass_stride,
1446  r,
1447  channel_offset,
1448  0,
1449  a,
1450  k_2);
1451  kernel_set_args(
1452  ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f);
1453  kernel_set_args(
1454  ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f);
1455  kernel_set_args(ckNLMUpdateOutput,
1456  0,
1457  blurDifference_mem,
1458  image_mem,
1459  out_mem,
1460  weightAccum_mem,
1461  w,
1462  h,
1463  stride,
1464  pass_stride,
1465  channel_offset,
1466  r,
1467  f);
1468 
1469  enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
1470  enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1471  enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
1472  enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1473  enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true);
1474 
1475  kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride);
1476  enqueue_kernel(ckNLMNormalize, w, h);
1477 
1478  return true;
1479 }
1480 
1481 bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
1482 {
1483  cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
1484  cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
1485  cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1486  cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1487 
1488  char use_time = task->buffer.use_time ? 1 : 0;
1489 
1490  cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
1491 
1492  int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem);
1493  cl_mem buffers[9];
1494  for (int i = 0; i < 9; i++) {
1495  buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1496  arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]);
1497  }
1498  kernel_set_args(ckFilterConstructTransform,
1499  arg_ofs,
1500  transform_mem,
1501  rank_mem,
1502  task->filter_area,
1503  task->rect,
1504  task->buffer.pass_stride,
1505  task->buffer.frame_stride,
1506  use_time,
1507  task->radius,
1508  task->pca_threshold);
1509 
1510  enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256);
1511 
1512  return true;
1513 }
1514 
1515 bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
1516  device_ptr color_variance_ptr,
1517  device_ptr scale_ptr,
1518  int frame,
1520 {
1521  cl_mem color_mem = CL_MEM_PTR(color_ptr);
1522  cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
1523  cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
1524 
1525  cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
1526  cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
1527  cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1528  cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
1529  cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
1530 
1531  cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
1532  cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
1533  cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
1534  cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
1535 
1536  int w = task->reconstruction_state.source_w;
1537  int h = task->reconstruction_state.source_h;
1538  int stride = task->buffer.stride;
1539  int frame_offset = frame * task->buffer.frame_stride;
1540  int t = task->tile_info->frames[frame];
1541  char use_time = task->buffer.use_time ? 1 : 0;
1542 
1543  int r = task->radius;
1544  int pass_stride = task->buffer.pass_stride;
1545  int num_shifts = (2 * r + 1) * (2 * r + 1);
1546 
1547  device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
1548  device_sub_ptr blurDifference(
1549  task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
1550  cl_mem difference_mem = CL_MEM_PTR(*difference);
1551  cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
1552 
1553  kernel_set_args(ckNLMCalcDifference,
1554  0,
1555  color_mem,
1556  color_variance_mem,
1557  scale_mem,
1558  difference_mem,
1559  w,
1560  h,
1561  stride,
1562  pass_stride,
1563  r,
1564  pass_stride,
1565  frame_offset,
1566  1.0f,
1567  task->nlm_k_2);
1568  kernel_set_args(
1569  ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4);
1570  kernel_set_args(
1571  ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4);
1572  kernel_set_args(ckNLMConstructGramian,
1573  0,
1574  t,
1575  blurDifference_mem,
1576  buffer_mem,
1577  transform_mem,
1578  rank_mem,
1579  XtWX_mem,
1580  XtWY_mem,
1581  task->reconstruction_state.filter_window,
1582  w,
1583  h,
1584  stride,
1585  pass_stride,
1586  r,
1587  4,
1588  frame_offset,
1589  use_time);
1590 
1591  enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
1592  enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1593  enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
1594  enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1595  enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256);
1596 
1597  return true;
1598 }
1599 
1600 bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
1601 {
1602  cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
1603 
1604  cl_mem output_mem = CL_MEM_PTR(output_ptr);
1605  cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1606  cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
1607  cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
1608 
1609  int w = task->reconstruction_state.source_w;
1610  int h = task->reconstruction_state.source_h;
1611 
1612  kernel_set_args(ckFinalize,
1613  0,
1614  output_mem,
1615  rank_mem,
1616  XtWX_mem,
1617  XtWY_mem,
1618  task->filter_area,
1619  task->reconstruction_state.buffer_params,
1620  task->render_buffer.samples);
1621  enqueue_kernel(ckFinalize, w, h);
1622 
1623  return true;
1624 }
1625 
1626 bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
1627  device_ptr b_ptr,
1628  device_ptr mean_ptr,
1629  device_ptr variance_ptr,
1630  int r,
1631  int4 rect,
1633 {
1634  cl_mem a_mem = CL_MEM_PTR(a_ptr);
1635  cl_mem b_mem = CL_MEM_PTR(b_ptr);
1636  cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1637  cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1638 
1639  cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
1640 
1641  kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r);
1642  enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1643 
1644  return true;
1645 }
1646 
1647 bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
1648  device_ptr b_ptr,
1649  device_ptr sample_variance_ptr,
1650  device_ptr sv_variance_ptr,
1651  device_ptr buffer_variance_ptr,
1653 {
1654  cl_mem a_mem = CL_MEM_PTR(a_ptr);
1655  cl_mem b_mem = CL_MEM_PTR(b_ptr);
1656  cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
1657  cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
1658  cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
1659 
1660  cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1661 
1662  cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
1663 
1664  int arg_ofs = kernel_set_args(
1665  ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem);
1666  cl_mem buffers[9];
1667  for (int i = 0; i < 9; i++) {
1668  buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1669  arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]);
1670  }
1671  kernel_set_args(ckFilterDivideShadow,
1672  arg_ofs,
1673  a_mem,
1674  b_mem,
1675  sample_variance_mem,
1676  sv_variance_mem,
1677  buffer_variance_mem,
1678  task->rect,
1679  task->render_buffer.pass_stride,
1680  task->render_buffer.offset);
1681  enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1682 
1683  return true;
1684 }
1685 
1686 bool OpenCLDevice::denoising_get_feature(int mean_offset,
1687  int variance_offset,
1688  device_ptr mean_ptr,
1689  device_ptr variance_ptr,
1690  float scale,
1692 {
1693  cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1694  cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1695 
1696  cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1697 
1698  cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
1699 
1700  int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem);
1701  cl_mem buffers[9];
1702  for (int i = 0; i < 9; i++) {
1703  buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1704  arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]);
1705  }
1706  kernel_set_args(ckFilterGetFeature,
1707  arg_ofs,
1708  mean_offset,
1709  variance_offset,
1710  mean_mem,
1711  variance_mem,
1712  scale,
1713  task->rect,
1714  task->render_buffer.pass_stride,
1715  task->render_buffer.offset);
1716  enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1717 
1718  return true;
1719 }
1720 
1721 bool OpenCLDevice::denoising_write_feature(int out_offset,
1722  device_ptr from_ptr,
1723  device_ptr buffer_ptr,
1725 {
1726  cl_mem from_mem = CL_MEM_PTR(from_ptr);
1727  cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
1728 
1729  cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
1730 
1731  kernel_set_args(ckFilterWriteFeature,
1732  0,
1733  task->render_buffer.samples,
1734  task->reconstruction_state.buffer_params,
1735  task->filter_area,
1736  from_mem,
1737  buffer_mem,
1738  out_offset,
1739  task->rect);
1740  enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w);
1741 
1742  return true;
1743 }
1744 
1745 bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
1746  device_ptr variance_ptr,
1747  device_ptr depth_ptr,
1748  device_ptr output_ptr,
1750 {
1751  cl_mem image_mem = CL_MEM_PTR(image_ptr);
1752  cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1753  cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1754  cl_mem output_mem = CL_MEM_PTR(output_ptr);
1755 
1756  cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1757 
1758  kernel_set_args(ckFilterDetectOutliers,
1759  0,
1760  image_mem,
1761  variance_mem,
1762  depth_mem,
1763  output_mem,
1764  task->rect,
1765  task->buffer.pass_stride);
1766  enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1767 
1768  return true;
1769 }
1770 
1771 void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
1772 {
1774  &OpenCLDevice::denoising_construct_transform, this, &denoising);
1775  denoising.functions.accumulate = function_bind(
1776  &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
1777  denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
1779  &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1781  &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1783  &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1784  denoising.functions.get_feature = function_bind(
1785  &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
1787  &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
1789  &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1790 
1791  denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1792  denoising.render_buffer.samples = rtile.sample;
1793  denoising.buffer.gpu_temporary_mem = true;
1794 
1795  denoising.run_denoising(rtile);
1796 }
1797 
1799 {
1800  /* cast arguments to cl types */
1801  cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1802  cl_mem d_input = CL_MEM_PTR(task.shader_input);
1803  cl_mem d_output = CL_MEM_PTR(task.shader_output);
1804  cl_int d_shader_eval_type = task.shader_eval_type;
1805  cl_int d_shader_filter = task.shader_filter;
1806  cl_int d_shader_x = task.shader_x;
1807  cl_int d_shader_w = task.shader_w;
1808  cl_int d_offset = task.offset;
1809 
1810  OpenCLDevice::OpenCLProgram *program = &background_program;
1811  if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1812  program = &displace_program;
1813  }
1814  program->wait_for_availability();
1815  cl_kernel kernel = (*program)();
1816 
1817  cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output);
1818 
1819  set_kernel_arg_buffers(kernel, &start_arg_index);
1820 
1821  start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type);
1822  if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
1823  start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter);
1824  }
1825  start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset);
1826 
1827  for (int sample = 0; sample < task.num_samples; sample++) {
1828 
1829  if (task.get_cancel())
1830  break;
1831 
1832  kernel_set_args(kernel, start_arg_index, sample);
1833 
1834  enqueue_kernel(kernel, task.shader_w, 1);
1835 
1836  clFinish(cqCommandQueue);
1837 
1838  task.update_progress(NULL);
1839  }
1840 }
1841 
1843 {
1844  scoped_timer timer(&rtile.buffers->render_time);
1845 
1846  /* Cast arguments to cl types. */
1847  cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1848  cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
1849  cl_int d_x = rtile.x;
1850  cl_int d_y = rtile.y;
1851  cl_int d_w = rtile.w;
1852  cl_int d_h = rtile.h;
1853  cl_int d_offset = rtile.offset;
1854  cl_int d_stride = rtile.stride;
1855 
1856  bake_program.wait_for_availability();
1857  cl_kernel kernel = bake_program();
1858 
1859  cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer);
1860 
1861  set_kernel_arg_buffers(kernel, &start_arg_index);
1862 
1863  start_arg_index += kernel_set_args(
1864  kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride);
1865 
1866  int start_sample = rtile.start_sample;
1867  int end_sample = rtile.start_sample + rtile.num_samples;
1868 
1869  for (int sample = start_sample; sample < end_sample; sample++) {
1870  if (task.get_cancel()) {
1871  if (task.need_finish_queue == false)
1872  break;
1873  }
1874 
1875  kernel_set_args(kernel, start_arg_index, sample);
1876 
1877  enqueue_kernel(kernel, d_w, d_h);
1878  clFinish(cqCommandQueue);
1879 
1880  rtile.sample = sample + 1;
1881 
1882  task.update_progress(&rtile, rtile.w * rtile.h);
1883  }
1884 }
1885 
1886 static bool kernel_build_opencl_2(cl_device_id cdDevice)
1887 {
1888  /* Build with OpenCL 2.0 if available, this improves performance
1889  * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
1890  * Note that OpenCL selects the highest 1.x version by default,
1891  * only for 2.0 do we need the explicit compiler flag. */
1892  int version_major, version_minor;
1893  if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
1894  if (version_major >= 2) {
1895  /* This appears to trigger a driver bug in Radeon RX cards with certain
1896  * driver version, so don't use OpenCL 2.0 for those. */
1897  string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
1898  if (string_startswith(device_name, "Radeon RX 4") ||
1899  string_startswith(device_name, "Radeon (TM) RX 4") ||
1900  string_startswith(device_name, "Radeon RX 5") ||
1901  string_startswith(device_name, "Radeon (TM) RX 5")) {
1902  char version[256] = "";
1903  int driver_major, driver_minor;
1904  clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
1905  if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
1906  return !(driver_major == 3075 && driver_minor <= 12);
1907  }
1908  }
1909 
1910  return true;
1911  }
1912  }
1913 
1914  return false;
1915 }
1916 
1917 string OpenCLDevice::kernel_build_options(const string *debug_src)
1918 {
1919  string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1920 
1921  if (kernel_build_opencl_2(cdDevice)) {
1922  build_options += "-cl-std=CL2.0 ";
1923  }
1924 
1925  if (platform_name == "NVIDIA CUDA") {
1926  build_options +=
1927  "-D__KERNEL_OPENCL_NVIDIA__ "
1928  "-cl-nv-maxrregcount=32 "
1929  "-cl-nv-verbose ";
1930 
1931  uint compute_capability_major, compute_capability_minor;
1932  clGetDeviceInfo(cdDevice,
1933  CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1934  sizeof(cl_uint),
1935  &compute_capability_major,
1936  NULL);
1937  clGetDeviceInfo(cdDevice,
1938  CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1939  sizeof(cl_uint),
1940  &compute_capability_minor,
1941  NULL);
1942 
1943  build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1944  compute_capability_major * 100 + compute_capability_minor * 10);
1945  }
1946 
1947  else if (platform_name == "Apple")
1948  build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1949 
1950  else if (platform_name == "AMD Accelerated Parallel Processing")
1951  build_options += "-D__KERNEL_OPENCL_AMD__ ";
1952 
1953  else if (platform_name == "Intel(R) OpenCL") {
1954  build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1955 
1956  /* Options for gdb source level kernel debugging.
1957  * this segfaults on linux currently.
1958  */
1959  if (OpenCLInfo::use_debug() && debug_src)
1960  build_options += "-g -s \"" + *debug_src + "\" ";
1961  }
1962 
1963  if (info.has_half_images) {
1964  build_options += "-D__KERNEL_CL_KHR_FP16__ ";
1965  }
1966 
1967  if (OpenCLInfo::use_debug()) {
1968  build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1969  }
1970 
1971 # ifdef WITH_CYCLES_DEBUG
1972  build_options += "-D__KERNEL_DEBUG__ ";
1973 # endif
1974 
1975 # ifdef WITH_NANOVDB
1976  if (info.has_nanovdb) {
1977  build_options += "-DWITH_NANOVDB ";
1978  }
1979 # endif
1980 
1981  return build_options;
1982 }
1983 
1984 /* TODO(sergey): In the future we can use variadic templates, once
1985  * C++0x is allowed. Should allow to clean this up a bit.
1986  */
1987 int OpenCLDevice::kernel_set_args(cl_kernel kernel,
1988  int start_argument_index,
1989  const ArgumentWrapper &arg1,
1990  const ArgumentWrapper &arg2,
1991  const ArgumentWrapper &arg3,
1992  const ArgumentWrapper &arg4,
1993  const ArgumentWrapper &arg5,
1994  const ArgumentWrapper &arg6,
1995  const ArgumentWrapper &arg7,
1996  const ArgumentWrapper &arg8,
1997  const ArgumentWrapper &arg9,
1998  const ArgumentWrapper &arg10,
1999  const ArgumentWrapper &arg11,
2000  const ArgumentWrapper &arg12,
2001  const ArgumentWrapper &arg13,
2002  const ArgumentWrapper &arg14,
2003  const ArgumentWrapper &arg15,
2004  const ArgumentWrapper &arg16,
2005  const ArgumentWrapper &arg17,
2006  const ArgumentWrapper &arg18,
2007  const ArgumentWrapper &arg19,
2008  const ArgumentWrapper &arg20,
2009  const ArgumentWrapper &arg21,
2010  const ArgumentWrapper &arg22,
2011  const ArgumentWrapper &arg23,
2012  const ArgumentWrapper &arg24,
2013  const ArgumentWrapper &arg25,
2014  const ArgumentWrapper &arg26,
2015  const ArgumentWrapper &arg27,
2016  const ArgumentWrapper &arg28,
2017  const ArgumentWrapper &arg29,
2018  const ArgumentWrapper &arg30,
2019  const ArgumentWrapper &arg31,
2020  const ArgumentWrapper &arg32,
2021  const ArgumentWrapper &arg33)
2022 {
2023  int current_arg_index = 0;
2024 # define FAKE_VARARG_HANDLE_ARG(arg) \
2025  do { \
2026  if (arg.pointer != NULL) { \
2027  opencl_assert(clSetKernelArg( \
2028  kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \
2029  ++current_arg_index; \
2030  } \
2031  else { \
2032  return current_arg_index; \
2033  } \
2034  } while (false)
2035  FAKE_VARARG_HANDLE_ARG(arg1);
2036  FAKE_VARARG_HANDLE_ARG(arg2);
2037  FAKE_VARARG_HANDLE_ARG(arg3);
2038  FAKE_VARARG_HANDLE_ARG(arg4);
2039  FAKE_VARARG_HANDLE_ARG(arg5);
2040  FAKE_VARARG_HANDLE_ARG(arg6);
2041  FAKE_VARARG_HANDLE_ARG(arg7);
2042  FAKE_VARARG_HANDLE_ARG(arg8);
2043  FAKE_VARARG_HANDLE_ARG(arg9);
2044  FAKE_VARARG_HANDLE_ARG(arg10);
2045  FAKE_VARARG_HANDLE_ARG(arg11);
2046  FAKE_VARARG_HANDLE_ARG(arg12);
2047  FAKE_VARARG_HANDLE_ARG(arg13);
2048  FAKE_VARARG_HANDLE_ARG(arg14);
2049  FAKE_VARARG_HANDLE_ARG(arg15);
2050  FAKE_VARARG_HANDLE_ARG(arg16);
2051  FAKE_VARARG_HANDLE_ARG(arg17);
2052  FAKE_VARARG_HANDLE_ARG(arg18);
2053  FAKE_VARARG_HANDLE_ARG(arg19);
2054  FAKE_VARARG_HANDLE_ARG(arg20);
2055  FAKE_VARARG_HANDLE_ARG(arg21);
2056  FAKE_VARARG_HANDLE_ARG(arg22);
2057  FAKE_VARARG_HANDLE_ARG(arg23);
2058  FAKE_VARARG_HANDLE_ARG(arg24);
2059  FAKE_VARARG_HANDLE_ARG(arg25);
2060  FAKE_VARARG_HANDLE_ARG(arg26);
2061  FAKE_VARARG_HANDLE_ARG(arg27);
2062  FAKE_VARARG_HANDLE_ARG(arg28);
2063  FAKE_VARARG_HANDLE_ARG(arg29);
2064  FAKE_VARARG_HANDLE_ARG(arg30);
2065  FAKE_VARARG_HANDLE_ARG(arg31);
2066  FAKE_VARARG_HANDLE_ARG(arg32);
2067  FAKE_VARARG_HANDLE_ARG(arg33);
2068 # undef FAKE_VARARG_HANDLE_ARG
2069  return current_arg_index;
2070 }
2071 
2072 void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
2073 {
2074  if (kernel) {
2075  clReleaseKernel(kernel);
2076  }
2077 }
2078 
2079 void OpenCLDevice::release_mem_object_safe(cl_mem mem)
2080 {
2081  if (mem != NULL) {
2082  clReleaseMemObject(mem);
2083  }
2084 }
2085 
2086 void OpenCLDevice::release_program_safe(cl_program program)
2087 {
2088  if (program) {
2089  clReleaseProgram(program);
2090  }
2091 }
2092 
2093 /* ** Those guys are for working around some compiler-specific bugs ** */
2094 
2095 cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker)
2096 {
2097  return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
2098 }
2099 
2100 void OpenCLDevice::store_cached_kernel(cl_program program,
2101  ustring key,
2102  thread_scoped_lock &cache_locker)
2103 {
2104  OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
2105 }
2106 
2107 Device *opencl_create_split_device(DeviceInfo &info,
2108  Stats &stats,
2109  Profiler &profiler,
2110  bool background)
2111 {
2112  return new OpenCLDevice(info, stats, profiler, background);
2113 }
2114 
2116 
2117 #endif
sqrt(x)+1/max(0
unsigned char uchar
Definition: BLI_sys_types.h:86
unsigned int uint
Definition: BLI_sys_types.h:83
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_BOOL GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLushort pattern _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble u2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLdouble GLdouble v2 _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLdouble GLdouble nz _GL_VOID_RET _GL_VOID GLfloat GLfloat nz _GL_VOID_RET _GL_VOID GLint GLint nz _GL_VOID_RET _GL_VOID GLshort GLshort nz _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const GLfloat *values _GL_VOID_RET _GL_VOID GLsizei const GLushort *values _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID const GLuint const GLclampf *priorities _GL_VOID_RET _GL_VOID GLdouble y _GL_VOID_RET _GL_VOID GLfloat y _GL_VOID_RET _GL_VOID GLint y _GL_VOID_RET _GL_VOID GLshort y _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLfloat GLfloat z _GL_VOID_RET _GL_VOID GLint GLint z _GL_VOID_RET _GL_VOID GLshort GLshort z _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble w _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat w _GL_VOID_RET _GL_VOID GLint GLint GLint w _GL_VOID_RET _GL_VOID GLshort GLshort GLshort w _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble y2 _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat y2 _GL_VOID_RET _GL_VOID GLint GLint GLint y2 _GL_VOID_RET _GL_VOID GLshort GLshort GLshort y2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLuint *buffer _GL_VOID_RET _GL_VOID GLdouble t _GL_VOID_RET _GL_VOID GLfloat t _GL_VOID_RET _GL_VOID GLint t _GL_VOID_RET _GL_VOID GLshort t _GL_VOID_RET _GL_VOID GLdouble GLdouble r _GL_VOID_RET _GL_VOID GLfloat GLfloat r _GL_VOID_RET _GL_VOID GLint GLint r _GL_VOID_RET _GL_VOID GLshort GLshort r _GL_VOID_RET _GL_VOID GLdouble GLdouble r
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum type
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint * textures
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint y
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei stride
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_BOOL GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLushort pattern _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble u2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLdouble GLdouble v2 _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLdouble GLdouble nz _GL_VOID_RET _GL_VOID GLfloat GLfloat nz _GL_VOID_RET _GL_VOID GLint GLint nz _GL_VOID_RET _GL_VOID GLshort GLshort nz _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const GLfloat *values _GL_VOID_RET _GL_VOID GLsizei const GLushort *values _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID const GLuint const GLclampf *priorities _GL_VOID_RET _GL_VOID GLdouble y _GL_VOID_RET _GL_VOID GLfloat y _GL_VOID_RET _GL_VOID GLint y _GL_VOID_RET _GL_VOID GLshort y _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLfloat GLfloat z _GL_VOID_RET _GL_VOID GLint GLint z _GL_VOID_RET _GL_VOID GLshort GLshort z _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble w _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat w _GL_VOID_RET _GL_VOID GLint GLint GLint w _GL_VOID_RET _GL_VOID GLshort GLshort GLshort w _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble y2 _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat y2 _GL_VOID_RET _GL_VOID GLint GLint GLint y2 _GL_VOID_RET _GL_VOID GLshort GLshort GLshort y2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLuint *buffer _GL_VOID_RET _GL_VOID GLdouble t _GL_VOID_RET _GL_VOID GLfloat t _GL_VOID_RET _GL_VOID GLint t _GL_VOID_RET _GL_VOID GLshort t _GL_VOID_RET _GL_VOID GLdouble t
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition: btDbvt.cpp:52
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition: btQuadWord.h:119
void run_denoising(RenderTile &tile)
struct DenoisingTask::RenderBuffers render_buffer
struct DenoisingTask::DeviceFunctions functions
struct DenoisingTask::DenoiseBuffers buffer
int num
Definition: device.h:77
bool use_integrator_branched
Definition: device.h:159
string get_build_options() const
Definition: device.h:233
virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, RenderTile &rtile, int num_global_elements, device_memory &kernel_globals, device_memory &kernel_data_, device_memory &split_data, device_memory &ray_state, device_memory &queue_index, device_memory &use_queues_flag, device_memory &work_pool_wgs)=0
virtual SplitKernelFunction * get_split_kernel_function(const string &kernel_name, const DeviceRequestedFeatures &)=0
size_t max_elements_for_max_buffer_size(device_memory &kg, device_memory &data, uint64_t max_buffer_size)
virtual int2 split_kernel_local_size()=0
virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)=0
virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task)=0
Definition: device.h:293
string get_hex()
Definition: util_md5.cpp:366
void append(const uint8_t *data, int size)
Definition: util_md5.cpp:274
double render_time
Definition: buffers.h:82
int stride
Definition: buffers.h:143
int sample
Definition: buffers.h:140
RenderBuffers * buffers
Definition: buffers.h:152
int num_samples
Definition: buffers.h:139
@ PATH_TRACE
Definition: buffers.h:134
device_ptr buffer
Definition: buffers.h:146
Task task
Definition: buffers.h:136
int offset
Definition: buffers.h:142
int start_sample
Definition: buffers.h:138
virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)=0
size_t mem_used
Definition: util_stats.h:48
void mem_free(size_t size)
Definition: util_stats.h:42
void mem_alloc(size_t size)
Definition: util_stats.h:36
const char * name
MemoryType type
size_t memory_elements_size(int elements)
void * host_pointer
size_t memory_size()
device_ptr device_pointer
size_t device_size
CCL_NAMESPACE_BEGIN struct Options options
void * user_data
#define function_bind
DeviceKernelStatus
Definition: device.h:63
@ DEVICE_KERNEL_USING_FEATURE_KERNEL
Definition: device.h:65
@ MEM_GLOBAL
Definition: device_memory.h:39
@ MEM_TEXTURE
Definition: device_memory.h:40
@ MEM_READ_WRITE
Definition: device_memory.h:37
@ MEM_READ_ONLY
Definition: device_memory.h:36
TaskPool * task_pool
static FT_Error err
Definition: freetypefont.c:52
ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
#define kernel_data
#define ccl_constant
#define ccl_global
#define CCL_NAMESPACE_END
#define make_int4(x, y, z, w)
#define make_int2(x, y)
void KERNEL_FUNCTION_FULL_NAME() data_init(KernelGlobals *kg, ccl_constant KernelData *data, ccl_global void *split_data_buffer, int num_elements, ccl_global char *ray_state, int start_sample, int end_sample, int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, int queuesize, ccl_global char *use_queues_flag, ccl_global unsigned int *work_pool_wgs, unsigned int num_samples, ccl_global float *buffer)
void KERNEL_FUNCTION_FULL_NAME() shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int filter, int i, int offset, int sample)
CCL_NAMESPACE_BEGIN ccl_device void kernel_data_init(KernelGlobals *kg, ccl_constant KernelData *data, ccl_global void *split_data_buffer, int num_elements, ccl_global char *ray_state, int start_sample, int end_sample, int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, int queuesize, ccl_global char *use_queues_flag, ccl_global unsigned int *work_pools, unsigned int num_samples, ccl_global float *buffer)
ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg, ShaderData *emission_sd, ccl_addr_space PathState *state, ccl_global float *buffer, ccl_addr_space Ray *ray)
__kernel void ccl_constant KernelData ccl_global void ccl_global char ccl_global int ccl_global char ccl_global unsigned int ccl_global float * buffer
__kernel void ccl_constant KernelData ccl_global void ccl_global char ccl_global int ccl_global char * use_queues_flag
__kernel void ccl_constant KernelData ccl_global void ccl_global char ccl_global int * queue_index
__kernel void ccl_constant KernelData ccl_global void ccl_global char * ray_state
__kernel void ccl_constant KernelData ccl_global void ccl_global char ccl_global int ccl_global char ccl_global unsigned int * work_pools
@ SHADER_EVAL_DISPLACE
Definition: kernel_types.h:197
@ SHADER_EVAL_BAKE
Definition: kernel_types.h:200
static void error(const char *str)
Definition: meshlaplacian.c:65
static unsigned a[3]
Definition: RandGen.cpp:92
static void sample(SocketReader *reader, int x, int y, float color[4])
ListBase threads
list of all thread for every CPUDevice in cpudevices a thread exists.
struct blender::compositor::@172::@174 task
struct blender::compositor::@172::@175 opencl
static int bake(const BakeAPIRender *bkr, Object *ob_low, const ListBase *selected_objects, ReportList *reports)
#define min(a, b)
Definition: sort.c:51
unsigned char uint8_t
Definition: stdint.h:81
unsigned __int64 uint64_t
Definition: stdint.h:93
function< bool(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, device_ptr variance_ptr, int r, int4 rect)> combine_halves
function< bool(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, device_ptr output_ptr)> detect_outliers
function< bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature
function< bool(device_ptr output_ptr)> solve
function< bool(device_ptr a_ptr, device_ptr b_ptr, device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, device_ptr buffer_variance_ptr)> divide_shadow
function< bool()> construct_transform
function< bool(int mean_offset, int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, float scale)> get_feature
function< bool(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)> accumulate
function< bool(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr)> non_local_means
void cancel()
Definition: util_task.cpp:54
uint64_t data
Definition: util_texture.h:97
#define NODE_GROUP_LEVEL_MAX
Definition: svm_types.h:46
#define NODE_FEATURE_ALL
Definition: svm_types.h:57
#define NODE_FEATURE_VOLUME
Definition: svm_types.h:48
float max
void util_aligned_free(void *ptr)
CCL_NAMESPACE_BEGIN void * util_aligned_malloc(size_t size, int alignment)
DebugFlags & DebugFlags()
Definition: util_debug.h:205
#define VLOG(severity)
Definition: util_logging.h:50
void path_init(const string &path, const string &user_path)
Definition: util_path.cpp:338
string string_human_readable_size(size_t size)
string string_human_readable_number(size_t num)
bool string_startswith(const string &s, const char *start)
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
Definition: util_string.cpp:32
std::unique_lock< std::mutex > thread_scoped_lock
Definition: util_thread.h:41
ccl_device_inline size_t round_down(size_t x, size_t multiple)
Definition: util_types.h:80
uint64_t device_ptr
Definition: util_types.h:62
PointerRNA * ptr
Definition: wm_files.c:3157