Blender  V2.93
device_cpu.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <stdlib.h>
18 #include <string.h>
19 
20 /* So ImathMath is included before our kernel_cpu_compat. */
21 #ifdef WITH_OSL
22 /* So no context pollution happens from indirectly included windows.h */
23 # include "util/util_windows.h"
24 # include <OSL/oslexec.h>
25 #endif
26 
27 #ifdef WITH_EMBREE
28 # include <embree3/rtcore.h>
29 #endif
30 
31 #include "device/device.h"
33 #include "device/device_intern.h"
35 
36 // clang-format off
37 #include "kernel/kernel.h"
39 #include "kernel/kernel_types.h"
41 #include "kernel/kernel_globals.h"
43 
44 #include "kernel/filter/filter.h"
45 
46 #include "kernel/osl/osl_shader.h"
47 #include "kernel/osl/osl_globals.h"
48 // clang-format on
49 
50 #include "bvh/bvh_embree.h"
51 
52 #include "render/buffers.h"
53 #include "render/coverage.h"
54 
55 #include "util/util_debug.h"
56 #include "util/util_foreach.h"
57 #include "util/util_function.h"
58 #include "util/util_logging.h"
59 #include "util/util_map.h"
61 #include "util/util_opengl.h"
62 #include "util/util_optimization.h"
63 #include "util/util_progress.h"
64 #include "util/util_system.h"
65 #include "util/util_task.h"
66 #include "util/util_thread.h"
67 
69 
70 class CPUDevice;
71 
72 /* Has to be outside of the class to be shared across template instantiations. */
73 static const char *logged_architecture = "";
74 
/* Selects, at construction time, the most capable compiled CPU kernel variant
 * (default/SSE2/SSE3/SSE4.1/AVX/AVX2) and stores it as a plain function
 * pointer of type F.  Logged once per process via file-scope
 * logged_architecture so every template instantiation shares the message.
 * NOTE(review): the documentation extraction dropped several lines of this
 * definition (both constructor signatures, the runtime CPU-feature `if`
 * conditions at internal lines 96/103/110/117/124, and the member
 * declaration at 149); presumably the conditions call
 * system_cpu_support_sse2()/.../_avx2() -- confirm against the original
 * device_cpu.cpp before relying on this listing. */
75 template<typename F> class KernelFunctions {
76  public:
/* Default constructor: no kernel variant selected yet. */
78  {
79  kernel = (F)NULL;
80  }
81 
/* Selecting constructor: one function pointer per compiled ISA variant. */
83  F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
84  {
85  const char *architecture_name = "default";
86  kernel = kernel_default;
87 
88  /* Silence potential warnings about unused variables
89  * when compiling without some architectures. */
90  (void)kernel_sse2;
91  (void)kernel_sse3;
92  (void)kernel_sse41;
93  (void)kernel_avx;
94  (void)kernel_avx2;
/* Probe from widest to narrowest ISA so the best supported variant wins;
 * each #ifdef chains into the next via the trailing `else`. */
95 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX2
97  architecture_name = "AVX2";
98  kernel = kernel_avx2;
99  }
100  else
101 #endif
102 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_AVX
104  architecture_name = "AVX";
105  kernel = kernel_avx;
106  }
107  else
108 #endif
109 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE41
111  architecture_name = "SSE4.1";
112  kernel = kernel_sse41;
113  }
114  else
115 #endif
116 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE3
118  architecture_name = "SSE3";
119  kernel = kernel_sse3;
120  }
121  else
122 #endif
123 #ifdef WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
125  architecture_name = "SSE2";
126  kernel = kernel_sse2;
127  }
128 #else
129  {
130  /* Dummy to prevent the architecture if below become
131  * conditional when WITH_CYCLES_OPTIMIZED_KERNEL_SSE2
132  * is not defined. */
133  }
134 #endif
135 
/* Report the chosen architecture only when it changes, to avoid log spam
 * across the many KernelFunctions instantiations. */
136  if (strcmp(architecture_name, logged_architecture) != 0) {
137  VLOG(1) << "Will be using " << architecture_name << " kernels.";
138  logged_architecture = architecture_name;
139  }
140  }
141 
/* Call-through operator; asserts a kernel variant was selected. */
142  inline F operator()() const
143  {
144  assert(kernel);
145  return kernel;
146  }
147 
148  protected:
150 };
151 
153  CPUDevice *device;
154 
155  public:
156  explicit CPUSplitKernel(CPUDevice *device);
157 
158  virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
159  RenderTile &rtile,
160  int num_global_elements,
161  device_memory &kernel_globals,
162  device_memory &kernel_data_,
163  device_memory &split_data,
167  device_memory &work_pool_wgs);
168 
169  virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
170  const DeviceRequestedFeatures &);
171  virtual int2 split_kernel_local_size();
173  virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
174 };
175 
176 class CPUDevice : public Device {
177  public:
179  KernelGlobals kernel_globals;
180 
183 
184 #ifdef WITH_OSL
185  OSLGlobals osl_globals;
186 #endif
187 #ifdef WITH_OPENIMAGEDENOISE
188  oidn::DeviceRef oidn_device;
189  oidn::FilterRef oidn_filter;
190 #endif
192 #ifdef WITH_EMBREE
193  RTCScene embree_scene = NULL;
194  RTCDevice embree_device;
195 #endif
196 
198 
200 
201  KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel;
202  KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
204  KernelFunctions<void (*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>
206  KernelFunctions<void (*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)>
208  KernelFunctions<void (*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel;
209 
210  KernelFunctions<void (*)(
211  int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)>
213  KernelFunctions<void (*)(
214  int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)>
216  KernelFunctions<void (*)(int, int, int, int *, float *, float *, int, int *)>
218  KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
220  KernelFunctions<void (*)(int, int, float *, float *, float *, float *, int *, int)>
222 
223  KernelFunctions<void (*)(
224  int, int, float *, float *, float *, float *, int *, int, int, int, float, float)>
226  KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel;
227  KernelFunctions<void (*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel;
228  KernelFunctions<void (*)(
229  int, int, float *, float *, float *, float *, float *, int *, int, int, int)>
231  KernelFunctions<void (*)(float *, float *, int *, int)> filter_nlm_normalize_kernel;
232 
233  KernelFunctions<void (*)(
234  float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)>
236  KernelFunctions<void (*)(int,
237  int,
238  int,
239  float *,
240  float *,
241  float *,
242  int *,
243  float *,
244  float3 *,
245  int *,
246  int *,
247  int,
248  int,
249  int,
250  int,
251  bool)>
253  KernelFunctions<void (*)(int, int, int, float *, int *, float *, float3 *, int *, int)>
255 
256  KernelFunctions<void (*)(KernelGlobals *,
258  ccl_global void *,
259  int,
260  ccl_global char *,
261  int,
262  int,
263  int,
264  int,
265  int,
266  int,
267  int,
268  int,
269  ccl_global int *,
270  int,
271  ccl_global char *,
272  ccl_global unsigned int *,
273  unsigned int,
274  ccl_global float *)>
276  unordered_map<string, KernelFunctions<void (*)(KernelGlobals *, KernelData *)>> split_kernels;
277 
278 #define KERNEL_FUNCTIONS(name) \
279  KERNEL_NAME_EVAL(cpu, name), KERNEL_NAME_EVAL(cpu_sse2, name), \
280  KERNEL_NAME_EVAL(cpu_sse3, name), KERNEL_NAME_EVAL(cpu_sse41, name), \
281  KERNEL_NAME_EVAL(cpu_avx, name), KERNEL_NAME_EVAL(cpu_avx2, name)
282 
283  CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
284  : Device(info_, stats_, profiler_, background_),
285  texture_info(this, "__texture_info", MEM_GLOBAL),
286 #define REGISTER_KERNEL(name) name##_kernel(KERNEL_FUNCTIONS(name))
306 #undef REGISTER_KERNEL
307  {
308  if (info.cpu_threads == 0) {
310  }
311 
312 #ifdef WITH_OSL
313  kernel_globals.osl = &osl_globals;
314 #endif
315 #ifdef WITH_EMBREE
316  embree_device = rtcNewDevice("verbose=0");
317 #endif
319  if (use_split_kernel) {
320  VLOG(1) << "Will be using split kernel.";
321  }
322  need_texture_info = false;
323 
324 #define REGISTER_SPLIT_KERNEL(name) \
325  split_kernels[#name] = KernelFunctions<void (*)(KernelGlobals *, KernelData *)>( \
326  KERNEL_FUNCTIONS(name))
329  REGISTER_SPLIT_KERNEL(lamp_emission);
330  REGISTER_SPLIT_KERNEL(do_volume);
331  REGISTER_SPLIT_KERNEL(queue_enqueue);
333  REGISTER_SPLIT_KERNEL(shader_setup);
334  REGISTER_SPLIT_KERNEL(shader_sort);
335  REGISTER_SPLIT_KERNEL(shader_eval);
336  REGISTER_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao);
337  REGISTER_SPLIT_KERNEL(subsurface_scatter);
338  REGISTER_SPLIT_KERNEL(direct_lighting);
339  REGISTER_SPLIT_KERNEL(shadow_blocked_ao);
340  REGISTER_SPLIT_KERNEL(shadow_blocked_dl);
341  REGISTER_SPLIT_KERNEL(enqueue_inactive);
342  REGISTER_SPLIT_KERNEL(next_iteration_setup);
343  REGISTER_SPLIT_KERNEL(indirect_subsurface);
344  REGISTER_SPLIT_KERNEL(buffer_update);
345  REGISTER_SPLIT_KERNEL(adaptive_stopping);
346  REGISTER_SPLIT_KERNEL(adaptive_filter_x);
347  REGISTER_SPLIT_KERNEL(adaptive_filter_y);
348  REGISTER_SPLIT_KERNEL(adaptive_adjust_samples);
349 #undef REGISTER_SPLIT_KERNEL
350 #undef KERNEL_FUNCTIONS
351  }
352 
354  {
355 #ifdef WITH_EMBREE
356  rtcReleaseDevice(embree_device);
357 #endif
358  task_pool.cancel();
359  texture_info.free();
360  }
361 
362  virtual bool show_samples() const override
363  {
364  return (info.cpu_threads == 1);
365  }
366 
367  virtual BVHLayoutMask get_bvh_layout_mask() const override
368  {
369  BVHLayoutMask bvh_layout_mask = BVH_LAYOUT_BVH2;
370 #ifdef WITH_EMBREE
371  bvh_layout_mask |= BVH_LAYOUT_EMBREE;
372 #endif /* WITH_EMBREE */
373  return bvh_layout_mask;
374  }
375 
377  {
378  if (need_texture_info) {
380  need_texture_info = false;
381  }
382  }
383 
/* Allocates device-side storage for a plain buffer.  Textures and globals
 * must go through tex_alloc()/global_alloc() instead.  For the CPU device,
 * host memory doubles as device memory whenever a host pointer exists;
 * device-only buffers get their own aligned allocation.
 * NOTE(review): the extraction dropped internal lines 402/405/409 -- the
 * assignments of `mem.device_pointer` (from `data` resp. `mem.host_pointer`)
 * and presumably a `stats.mem_alloc(...)` accounting call; confirm against
 * the original source. */
384  virtual void mem_alloc(device_memory &mem) override
385  {
386  if (mem.type == MEM_TEXTURE) {
387  assert(!"mem_alloc not supported for textures.");
388  }
389  else if (mem.type == MEM_GLOBAL) {
390  assert(!"mem_alloc not supported for global memory.");
391  }
392  else {
393  if (mem.name) {
394  VLOG(1) << "Buffer allocate: " << mem.name << ", "
395  << string_human_readable_number(mem.memory_size()) << " bytes. ("
396  << string_human_readable_size(mem.memory_size()) << ")";
397  }
398 
/* No usable host pointer: make a dedicated aligned allocation. */
399  if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
400  size_t alignment = MIN_ALIGNMENT_CPU_DATA_TYPES;
401  void *data = util_aligned_malloc(mem.memory_size(), alignment);
403  }
/* Otherwise the host allocation is reused as device memory. */
404  else {
406  }
407 
408  mem.device_size = mem.memory_size();
410  }
411  }
412 
413  virtual void mem_copy_to(device_memory &mem) override
414  {
415  if (mem.type == MEM_GLOBAL) {
416  global_free(mem);
417  global_alloc(mem);
418  }
419  else if (mem.type == MEM_TEXTURE) {
420  tex_free((device_texture &)mem);
421  tex_alloc((device_texture &)mem);
422  }
423  else if (mem.type == MEM_PIXELS) {
424  assert(!"mem_copy_to not supported for pixels.");
425  }
426  else {
427  if (!mem.device_pointer) {
428  mem_alloc(mem);
429  }
430 
431  /* copy is no-op */
432  }
433  }
434 
435  virtual void mem_copy_from(
436  device_memory & /*mem*/, int /*y*/, int /*w*/, int /*h*/, int /*elem*/) override
437  {
438  /* no-op */
439  }
440 
441  virtual void mem_zero(device_memory &mem) override
442  {
443  if (!mem.device_pointer) {
444  mem_alloc(mem);
445  }
446 
447  if (mem.device_pointer) {
448  memset((void *)mem.device_pointer, 0, mem.memory_size());
449  }
450  }
451 
/* Releases device-side storage.  Globals and textures are routed to their
 * dedicated free paths; plain buffers only own memory when it was allocated
 * separately from the host pointer (see mem_alloc).
 * NOTE(review): internal line 465 was dropped by the extraction --
 * presumably a `stats.mem_free(mem.device_size)` accounting call between
 * resetting the pointer and the size; confirm against the original source. */
452  virtual void mem_free(device_memory &mem) override
453  {
454  if (mem.type == MEM_GLOBAL) {
455  global_free(mem);
456  }
457  else if (mem.type == MEM_TEXTURE) {
458  tex_free((device_texture &)mem);
459  }
460  else if (mem.device_pointer) {
/* Only free when this device owns the allocation (no shared host pointer). */
461  if (mem.type == MEM_DEVICE_ONLY || !mem.host_pointer) {
462  util_aligned_free((void *)mem.device_pointer);
463  }
464  mem.device_pointer = 0;
466  mem.device_size = 0;
467  }
468  }
469 
470  virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/) override
471  {
472  return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
473  }
474 
475  virtual void const_copy_to(const char *name, void *host, size_t size) override
476  {
477 #if WITH_EMBREE
478  if (strcmp(name, "__data") == 0) {
479  assert(size <= sizeof(KernelData));
480 
481  // Update scene handle (since it is different for each device on multi devices)
482  KernelData *const data = (KernelData *)host;
483  data->bvh.scene = embree_scene;
484  }
485 #endif
486  kernel_const_copy(&kernel_globals, name, host, size);
487  }
488 
490  {
491  VLOG(1) << "Global memory allocate: " << mem.name << ", "
492  << string_human_readable_number(mem.memory_size()) << " bytes. ("
493  << string_human_readable_size(mem.memory_size()) << ")";
494 
496 
498  mem.device_size = mem.memory_size();
500  }
501 
503  {
504  if (mem.device_pointer) {
505  mem.device_pointer = 0;
507  mem.device_size = 0;
508  }
509  }
510 
512  {
513  VLOG(1) << "Texture allocate: " << mem.name << ", "
514  << string_human_readable_number(mem.memory_size()) << " bytes. ("
515  << string_human_readable_size(mem.memory_size()) << ")";
516 
518  mem.device_size = mem.memory_size();
520 
521  const uint slot = mem.slot;
522  if (slot >= texture_info.size()) {
523  /* Allocate some slots in advance, to reduce amount of re-allocations. */
524  texture_info.resize(slot + 128);
525  }
526 
527  texture_info[slot] = mem.info;
528  texture_info[slot].data = (uint64_t)mem.host_pointer;
529  need_texture_info = true;
530  }
531 
533  {
534  if (mem.device_pointer) {
535  mem.device_pointer = 0;
537  mem.device_size = 0;
538  need_texture_info = true;
539  }
540  }
541 
542  virtual void *osl_memory() override
543  {
544 #ifdef WITH_OSL
545  return &osl_globals;
546 #else
547  return NULL;
548 #endif
549  }
550 
/* Builds or refits the BVH.  Embree-layout BVHs are handled natively; the
 * top-level scene handle is remembered so const_copy_to() can patch it into
 * KernelData.  All other layouts defer to the generic Device::build_bvh().
 * NOTE(review): internal line 555 (the second half of the `if` condition and
 * its opening brace) was dropped by the extraction -- presumably it also
 * matches the multi-device Embree layout; confirm against the original. */
551  void build_bvh(BVH *bvh, Progress &progress, bool refit) override
552  {
553 #ifdef WITH_EMBREE
554  if (bvh->params.bvh_layout == BVH_LAYOUT_EMBREE ||
556  BVHEmbree *const bvh_embree = static_cast<BVHEmbree *>(bvh);
557  if (refit) {
558  bvh_embree->refit(progress);
559  }
560  else {
561  bvh_embree->build(progress, &stats, embree_device);
562  }
563 
/* Keep the top-level scene handle for kernel data updates. */
564  if (bvh->params.top_level) {
565  embree_scene = bvh_embree->scene;
566  }
567  }
568  else
569 #endif
570  Device::build_bvh(bvh, progress, refit);
571  }
572 
574  {
575  if (task.type == DeviceTask::RENDER)
577  else if (task.type == DeviceTask::SHADER)
579  else if (task.type == DeviceTask::FILM_CONVERT)
581  else if (task.type == DeviceTask::DENOISE_BUFFER)
583  }
584 
586  device_ptr guide_ptr,
587  device_ptr variance_ptr,
588  device_ptr out_ptr,
590  {
592 
593  int4 rect = task->rect;
594  int r = task->nlm_state.r;
595  int f = task->nlm_state.f;
596  float a = task->nlm_state.a;
597  float k_2 = task->nlm_state.k_2;
598 
599  int w = align_up(rect.z - rect.x, 4);
600  int h = rect.w - rect.y;
601  int stride = task->buffer.stride;
602  int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
603 
604  float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
605  float *blurDifference = temporary_mem;
606  float *difference = temporary_mem + task->buffer.pass_stride;
607  float *weightAccum = temporary_mem + 2 * task->buffer.pass_stride;
608 
609  memset(weightAccum, 0, sizeof(float) * w * h);
610  memset((float *)out_ptr, 0, sizeof(float) * w * h);
611 
612  for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
613  int dy = i / (2 * r + 1) - r;
614  int dx = i % (2 * r + 1) - r;
615 
616  int local_rect[4] = {
617  max(0, -dx), max(0, -dy), rect.z - rect.x - max(0, dx), rect.w - rect.y - max(0, dy)};
619  dy,
620  (float *)guide_ptr,
621  (float *)variance_ptr,
622  NULL,
623  difference,
624  local_rect,
625  w,
626  channel_offset,
627  0,
628  a,
629  k_2);
630 
631  filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
632  filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, w, f);
633  filter_nlm_blur_kernel()(difference, blurDifference, local_rect, w, f);
634 
636  dy,
637  blurDifference,
638  (float *)image_ptr,
639  difference,
640  (float *)out_ptr,
641  weightAccum,
642  local_rect,
643  channel_offset,
644  stride,
645  f);
646  }
647 
648  int local_rect[4] = {0, 0, rect.z - rect.x, rect.w - rect.y};
649  filter_nlm_normalize_kernel()((float *)out_ptr, weightAccum, local_rect, w);
650 
651  return true;
652  }
653 
655  {
657 
658  for (int y = 0; y < task->filter_area.w; y++) {
659  for (int x = 0; x < task->filter_area.z; x++) {
660  filter_construct_transform_kernel()((float *)task->buffer.mem.device_pointer,
661  task->tile_info,
662  x + task->filter_area.x,
663  y + task->filter_area.y,
664  y * task->filter_area.z + x,
665  (float *)task->storage.transform.device_pointer,
666  (int *)task->storage.rank.device_pointer,
667  &task->rect.x,
668  task->buffer.pass_stride,
669  task->buffer.frame_stride,
670  task->buffer.use_time,
671  task->radius,
672  task->pca_threshold);
673  }
674  }
675  return true;
676  }
677 
679  device_ptr color_variance_ptr,
680  device_ptr scale_ptr,
681  int frame,
683  {
685 
686  float *temporary_mem = (float *)task->buffer.temporary_mem.device_pointer;
687  float *difference = temporary_mem;
688  float *blurDifference = temporary_mem + task->buffer.pass_stride;
689 
690  int r = task->radius;
691  int frame_offset = frame * task->buffer.frame_stride;
692  for (int i = 0; i < (2 * r + 1) * (2 * r + 1); i++) {
693  int dy = i / (2 * r + 1) - r;
694  int dx = i % (2 * r + 1) - r;
695 
696  int local_rect[4] = {max(0, -dx),
697  max(0, -dy),
698  task->reconstruction_state.source_w - max(0, dx),
699  task->reconstruction_state.source_h - max(0, dy)};
701  dy,
702  (float *)color_ptr,
703  (float *)color_variance_ptr,
704  (float *)scale_ptr,
705  difference,
706  local_rect,
707  task->buffer.stride,
708  task->buffer.pass_stride,
709  frame_offset,
710  1.0f,
711  task->nlm_k_2);
712  filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
714  blurDifference, difference, local_rect, task->buffer.stride, 4);
715  filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
717  dy,
718  task->tile_info->frames[frame],
719  blurDifference,
720  (float *)task->buffer.mem.device_pointer,
721  (float *)task->storage.transform.device_pointer,
722  (int *)task->storage.rank.device_pointer,
723  (float *)task->storage.XtWX.device_pointer,
724  (float3 *)task->storage.XtWY.device_pointer,
725  local_rect,
726  &task->reconstruction_state.filter_window.x,
727  task->buffer.stride,
728  4,
729  task->buffer.pass_stride,
730  frame_offset,
731  task->buffer.use_time);
732  }
733 
734  return true;
735  }
736 
738  {
739  for (int y = 0; y < task->filter_area.w; y++) {
740  for (int x = 0; x < task->filter_area.z; x++) {
742  y,
743  y * task->filter_area.z + x,
744  (float *)output_ptr,
745  (int *)task->storage.rank.device_pointer,
746  (float *)task->storage.XtWX.device_pointer,
747  (float3 *)task->storage.XtWY.device_pointer,
748  &task->reconstruction_state.buffer_params.x,
749  task->render_buffer.samples);
750  }
751  }
752  return true;
753  }
754 
756  device_ptr b_ptr,
757  device_ptr mean_ptr,
758  device_ptr variance_ptr,
759  int r,
760  int4 rect,
762  {
764 
765  for (int y = rect.y; y < rect.w; y++) {
766  for (int x = rect.x; x < rect.z; x++) {
768  y,
769  (float *)mean_ptr,
770  (float *)variance_ptr,
771  (float *)a_ptr,
772  (float *)b_ptr,
773  &rect.x,
774  r);
775  }
776  }
777  return true;
778  }
779 
781  device_ptr b_ptr,
782  device_ptr sample_variance_ptr,
783  device_ptr sv_variance_ptr,
784  device_ptr buffer_variance_ptr,
786  {
788 
789  for (int y = task->rect.y; y < task->rect.w; y++) {
790  for (int x = task->rect.x; x < task->rect.z; x++) {
791  filter_divide_shadow_kernel()(task->render_buffer.samples,
792  task->tile_info,
793  x,
794  y,
795  (float *)a_ptr,
796  (float *)b_ptr,
797  (float *)sample_variance_ptr,
798  (float *)sv_variance_ptr,
799  (float *)buffer_variance_ptr,
800  &task->rect.x,
801  task->render_buffer.pass_stride,
802  task->render_buffer.offset);
803  }
804  }
805  return true;
806  }
807 
808  bool denoising_get_feature(int mean_offset,
809  int variance_offset,
810  device_ptr mean_ptr,
811  device_ptr variance_ptr,
812  float scale,
814  {
816 
817  for (int y = task->rect.y; y < task->rect.w; y++) {
818  for (int x = task->rect.x; x < task->rect.z; x++) {
819  filter_get_feature_kernel()(task->render_buffer.samples,
820  task->tile_info,
821  mean_offset,
822  variance_offset,
823  x,
824  y,
825  (float *)mean_ptr,
826  (float *)variance_ptr,
827  scale,
828  &task->rect.x,
829  task->render_buffer.pass_stride,
830  task->render_buffer.offset);
831  }
832  }
833  return true;
834  }
835 
836  bool denoising_write_feature(int out_offset,
837  device_ptr from_ptr,
838  device_ptr buffer_ptr,
840  {
841  for (int y = 0; y < task->filter_area.w; y++) {
842  for (int x = 0; x < task->filter_area.z; x++) {
843  filter_write_feature_kernel()(task->render_buffer.samples,
844  x + task->filter_area.x,
845  y + task->filter_area.y,
846  &task->reconstruction_state.buffer_params.x,
847  (float *)from_ptr,
848  (float *)buffer_ptr,
849  out_offset,
850  &task->rect.x);
851  }
852  }
853  return true;
854  }
855 
857  device_ptr variance_ptr,
858  device_ptr depth_ptr,
859  device_ptr output_ptr,
861  {
863 
864  for (int y = task->rect.y; y < task->rect.w; y++) {
865  for (int x = task->rect.x; x < task->rect.z; x++) {
867  y,
868  (float *)image_ptr,
869  (float *)variance_ptr,
870  (float *)depth_ptr,
871  (float *)output_ptr,
872  &task->rect.x,
873  task->buffer.pass_stride);
874  }
875  }
876  return true;
877  }
878 
/* Runs the adaptive-sampling convergence filter over a tile after `sample`
 * samples.  Returns true when every pixel has converged (i.e. rendering of
 * this tile can stop early).
 * NOTE(review): internal line 898 was dropped by the extraction -- the loop
 * body presumably calls the per-pixel adaptive stopping kernel on `buffer`;
 * confirm against the original source. */
879  bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
880  {
/* Repackage the render tile as a WorkTile for the filter kernels. */
881  WorkTile wtile;
882  wtile.x = tile.x;
883  wtile.y = tile.y;
884  wtile.w = tile.w;
885  wtile.h = tile.h;
886  wtile.offset = tile.offset;
887  wtile.stride = tile.stride;
888  wtile.buffer = (float *)tile.buffer;
889 
890  /* For CPU we do adaptive stopping per sample so we can stop earlier, but
891  * for combined CPU + GPU rendering we match the GPU and do it per tile
892  * after a given number of sample steps. */
893  if (!kernel_data.integrator.adaptive_stop_per_sample) {
894  for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
895  for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
896  const int index = wtile.offset + x + y * wtile.stride;
897  float *buffer = wtile.buffer + index * kernel_data.film.pass_stride;
899  }
900  }
901  }
902 
/* X then Y filter passes; `any` records whether any pixel is unconverged. */
903  bool any = false;
904  for (int y = wtile.y; y < wtile.y + wtile.h; ++y) {
905  any |= kernel_do_adaptive_filter_x(kg, y, &wtile);
906  }
907  for (int x = wtile.x; x < wtile.x + wtile.w; ++x) {
908  any |= kernel_do_adaptive_filter_y(kg, x, &wtile);
909  }
910  return (!any);
911  }
912 
913  void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
914  {
915  float *render_buffer = (float *)tile.buffer;
916  for (int y = tile.y; y < tile.y + tile.h; y++) {
917  for (int x = tile.x; x < tile.x + tile.w; x++) {
918  int index = tile.offset + x + y * tile.stride;
919  ccl_global float *buffer = render_buffer + index * kernel_data.film.pass_stride;
920  if (buffer[kernel_data.film.pass_sample_count] < 0.0f) {
921  buffer[kernel_data.film.pass_sample_count] = -buffer[kernel_data.film.pass_sample_count];
922  float sample_multiplier = tile.sample / buffer[kernel_data.film.pass_sample_count];
923  if (sample_multiplier != 1.0f) {
924  kernel_adaptive_post_adjust(kg, buffer, sample_multiplier);
925  }
926  }
927  else {
928  kernel_adaptive_post_adjust(kg, buffer, tile.sample / (tile.sample - 1.0f));
929  }
930  }
931  }
932  }
933 
/* Renders one tile sample-by-sample, handling path tracing and baking,
 * cancellation, tile stealing, cryptomatte coverage, and adaptive sampling.
 * NOTE(review): two lines were dropped by the extraction -- internal line
 * 950 (after "Needed for Embree.", presumably a SIMD flush-to-zero setup)
 * and 959 (inside the tile-stolen branch, presumably marking the tile as
 * stolen before breaking); confirm against the original source. */
934  void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
935  {
936  const bool use_coverage = kernel_data.film.cryptomatte_passes & CRYPT_ACCURATE;
937 
/* Measures wall time for this tile into its buffers. */
938  scoped_timer timer(&tile.buffers->render_time);
939 
940  Coverage coverage(kg, tile);
941  if (use_coverage) {
942  coverage.init_path_trace();
943  }
944 
945  float *render_buffer = (float *)tile.buffer;
946  int start_sample = tile.start_sample;
947  int end_sample = tile.start_sample + tile.num_samples;
948 
949  /* Needed for Embree. */
951 
952  for (int sample = start_sample; sample < end_sample; sample++) {
/* Stop on cancel unless the task still needs its finish queue drained. */
953  if (task.get_cancel() || TaskPool::canceled()) {
954  if (task.need_finish_queue == false)
955  break;
956  }
957 
/* Another device stole this tile; abandon further samples. */
958  if (tile.stealing_state == RenderTile::CAN_BE_STOLEN && task.get_tile_stolen()) {
960  break;
961  }
962 
963  if (tile.task == RenderTile::PATH_TRACE) {
964  for (int y = tile.y; y < tile.y + tile.h; y++) {
965  for (int x = tile.x; x < tile.x + tile.w; x++) {
966  if (use_coverage) {
967  coverage.init_pixel(x, y);
968  }
969  path_trace_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
970  }
971  }
972  }
973  else {
/* Non-path-trace tiles are baked. */
974  for (int y = tile.y; y < tile.y + tile.h; y++) {
975  for (int x = tile.x; x < tile.x + tile.w; x++) {
976  bake_kernel()(kg, render_buffer, sample, x, y, tile.offset, tile.stride);
977  }
978  }
979  }
980  tile.sample = sample + 1;
981 
/* Adaptive sampling: when all pixels converge, credit the remaining
 * samples to progress and stop early. */
982  if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(sample)) {
983  const bool stop = adaptive_sampling_filter(kg, tile, sample);
984  if (stop) {
985  const int num_progress_samples = end_sample - sample;
986  tile.sample = end_sample;
987  task.update_progress(&tile, tile.w * tile.h * num_progress_samples);
988  break;
989  }
990  }
991 
992  task.update_progress(&tile, tile.w * tile.h);
993  }
994  if (use_coverage) {
995  coverage.finalize();
996  }
997 
/* Skip the post-adjust for stolen tiles; the stealing device finishes them. */
998  if (task.adaptive_sampling.use && (tile.stealing_state != RenderTile::WAS_STOLEN)) {
999  adaptive_sampling_post(tile, kg);
1000  }
1001  }
1002 
1004  float *buffer,
1005  const size_t offset,
1006  const size_t stride,
1007  const size_t x,
1008  const size_t y,
1009  const size_t w,
1010  const size_t h,
1011  const float scale)
1012  {
1013 #ifdef WITH_OPENIMAGEDENOISE
1014  assert(openimagedenoise_supported());
1015 
1016  /* Only one at a time, since OpenImageDenoise itself is multithreaded for full
1017  * buffers, and for tiled rendering because creating multiple devices and filters
1018  * is slow and memory hungry as well.
1019  *
1020  * TODO: optimize tiled rendering case, by batching together denoising of many
1021  * tiles somehow? */
1022  static thread_mutex mutex;
1023  thread_scoped_lock lock(mutex);
1024 
1025  /* Create device and filter, cached for reuse. */
1026  if (!oidn_device) {
1027  oidn_device = oidn::newDevice();
1028  oidn_device.commit();
1029  }
1030  if (!oidn_filter) {
1031  oidn_filter = oidn_device.newFilter("RT");
1032  oidn_filter.set("hdr", true);
1033  oidn_filter.set("srgb", false);
1034  }
1035 
1036  /* Set images with appropriate stride for our interleaved pass storage. */
1037  struct {
1038  const char *name;
1039  const int offset;
1040  const bool scale;
1041  const bool use;
1042  array<float> scaled_buffer;
1043  } passes[] = {{"color", task.pass_denoising_data + DENOISING_PASS_COLOR, false, true},
1044  {"albedo",
1045  task.pass_denoising_data + DENOISING_PASS_ALBEDO,
1046  true,
1047  task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO},
1048  {"normal",
1049  task.pass_denoising_data + DENOISING_PASS_NORMAL,
1050  true,
1051  task.denoising.input_passes >= DENOISER_INPUT_RGB_ALBEDO_NORMAL},
1052  {"output", 0, false, true},
1053  { NULL,
1054  0 }};
1055 
1056  for (int i = 0; passes[i].name; i++) {
1057  if (!passes[i].use) {
1058  continue;
1059  }
1060 
1061  const int64_t pixel_offset = offset + x + y * stride;
1062  const int64_t buffer_offset = (pixel_offset * task.pass_stride + passes[i].offset);
1063  const int64_t pixel_stride = task.pass_stride;
1064  const int64_t row_stride = stride * pixel_stride;
1065 
1066  if (passes[i].scale && scale != 1.0f) {
1067  /* Normalize albedo and normal passes as they are scaled by the number of samples.
1068  * For the color passes OIDN will perform auto-exposure making it unnecessary. */
1069  array<float> &scaled_buffer = passes[i].scaled_buffer;
1070  scaled_buffer.resize(w * h * 3);
1071 
1072  for (int y = 0; y < h; y++) {
1073  const float *pass_row = buffer + buffer_offset + y * row_stride;
1074  float *scaled_row = scaled_buffer.data() + y * w * 3;
1075 
1076  for (int x = 0; x < w; x++) {
1077  scaled_row[x * 3 + 0] = pass_row[x * pixel_stride + 0] * scale;
1078  scaled_row[x * 3 + 1] = pass_row[x * pixel_stride + 1] * scale;
1079  scaled_row[x * 3 + 2] = pass_row[x * pixel_stride + 2] * scale;
1080  }
1081  }
1082 
1083  oidn_filter.setImage(
1084  passes[i].name, scaled_buffer.data(), oidn::Format::Float3, w, h, 0, 0, 0);
1085  }
1086  else {
1087  oidn_filter.setImage(passes[i].name,
1088  buffer + buffer_offset,
1089  oidn::Format::Float3,
1090  w,
1091  h,
1092  0,
1093  pixel_stride * sizeof(float),
1094  row_stride * sizeof(float));
1095  }
1096  }
1097 
1098  /* Execute filter. */
1099  oidn_filter.commit();
1100  oidn_filter.execute();
1101 #else
1102  (void)task;
1103  (void)buffer;
1104  (void)offset;
1105  (void)stride;
1106  (void)x;
1107  (void)y;
1108  (void)w;
1109  (void)h;
1110  (void)scale;
1111 #endif
1112  }
1113 
1115  {
1116  if (task.type == DeviceTask::DENOISE_BUFFER) {
1117  /* Copy pixels from compute device to CPU (no-op for CPU device). */
1118  rtile.buffers->buffer.copy_from_device();
1119 
1121  (float *)rtile.buffer,
1122  rtile.offset,
1123  rtile.stride,
1124  rtile.x,
1125  rtile.y,
1126  rtile.w,
1127  rtile.h,
1128  1.0f / rtile.sample);
1129 
1130  /* todo: it may be possible to avoid this copy, but we have to ensure that
1131  * when other code copies data from the device it doesn't overwrite the
1132  * denoiser buffers. */
1133  rtile.buffers->buffer.copy_to_device();
1134  }
1135  else {
1136  /* Per-tile denoising. */
1137  rtile.sample = rtile.start_sample + rtile.num_samples;
1138  const float scale = 1.0f / rtile.sample;
1139  const float invscale = rtile.sample;
1140  const size_t pass_stride = task.pass_stride;
1141 
1142  /* Map neighboring tiles into one buffer for denoising. */
1143  RenderTileNeighbors neighbors(rtile);
1144  task.map_neighbor_tiles(neighbors, this);
1145  RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
1146  rtile = center_tile;
1147 
1148  /* Calculate size of the tile to denoise (including overlap). The overlap
1149  * size was chosen empirically. OpenImageDenoise specifies an overlap size
1150  * of 128 but this is significantly bigger than typical tile size. */
1151  const int4 rect = rect_clip(rect_expand(center_tile.bounds(), 64), neighbors.bounds());
1152  const int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
1153 
1154  /* Adjacent tiles are in separate memory regions, copy into single buffer. */
1155  array<float> merged(rect_size.x * rect_size.y * task.pass_stride);
1156 
1157  for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
1158  RenderTile &ntile = neighbors.tiles[i];
1159  if (!ntile.buffer) {
1160  continue;
1161  }
1162 
1163  const int xmin = max(ntile.x, rect.x);
1164  const int ymin = max(ntile.y, rect.y);
1165  const int xmax = min(ntile.x + ntile.w, rect.z);
1166  const int ymax = min(ntile.y + ntile.h, rect.w);
1167 
1168  const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
1169  const float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
1170 
1171  const size_t merged_stride = rect_size.x;
1172  const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
1173  float *merged_buffer = merged.data() + merged_offset * pass_stride;
1174 
1175  for (int y = ymin; y < ymax; y++) {
1176  for (int x = 0; x < pass_stride * (xmax - xmin); x++) {
1177  merged_buffer[x] = tile_buffer[x] * scale;
1178  }
1179  tile_buffer += ntile.stride * pass_stride;
1180  merged_buffer += merged_stride * pass_stride;
1181  }
1182  }
1183 
1184  /* Denoise */
1186  task, merged.data(), 0, rect_size.x, 0, 0, rect_size.x, rect_size.y, 1.0f);
1187 
1188  /* Copy back result from merged buffer. */
1189  RenderTile &ntile = neighbors.target;
1190  if (ntile.buffer) {
1191  const int xmin = max(ntile.x, rect.x);
1192  const int ymin = max(ntile.y, rect.y);
1193  const int xmax = min(ntile.x + ntile.w, rect.z);
1194  const int ymax = min(ntile.y + ntile.h, rect.w);
1195 
1196  const size_t tile_offset = ntile.offset + xmin + ymin * ntile.stride;
1197  float *tile_buffer = (float *)ntile.buffer + tile_offset * pass_stride;
1198 
1199  const size_t merged_stride = rect_size.x;
1200  const size_t merged_offset = (xmin - rect.x) + (ymin - rect.y) * merged_stride;
1201  const float *merged_buffer = merged.data() + merged_offset * pass_stride;
1202 
1203  for (int y = ymin; y < ymax; y++) {
1204  for (int x = 0; x < pass_stride * (xmax - xmin); x += pass_stride) {
1205  tile_buffer[x + 0] = merged_buffer[x + 0] * invscale;
1206  tile_buffer[x + 1] = merged_buffer[x + 1] * invscale;
1207  tile_buffer[x + 2] = merged_buffer[x + 2] * invscale;
1208  }
1209  tile_buffer += ntile.stride * pass_stride;
1210  merged_buffer += merged_stride * pass_stride;
1211  }
1212  }
1213 
1214  task.unmap_neighbor_tiles(neighbors, this);
1215  }
1216  }
1217 
1218  void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
1219  {
1220  ProfilingHelper profiling(denoising.profiler, PROFILING_DENOISING);
1221 
1222  tile.sample = tile.start_sample + tile.num_samples;
1223 
1225  &CPUDevice::denoising_construct_transform, this, &denoising);
1226  denoising.functions.accumulate = function_bind(
1227  &CPUDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
1228  denoising.functions.solve = function_bind(&CPUDevice::denoising_solve, this, _1, &denoising);
1230  &CPUDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1232  &CPUDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1234  &CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1235  denoising.functions.get_feature = function_bind(
1236  &CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
1238  &CPUDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
1240  &CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1241 
1242  denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
1243  denoising.render_buffer.samples = tile.sample;
1244  denoising.buffer.gpu_temporary_mem = false;
1245 
1246  denoising.run_denoising(tile);
1247  }
1248 
1250  {
1251  if (TaskPool::canceled()) {
1252  if (task.need_finish_queue == false)
1253  return;
1254  }
1255 
1256  /* allocate buffer for kernel globals */
1257  device_only_memory<KernelGlobals> kgbuffer(this, "kernel_globals");
1258  kgbuffer.alloc_to_device(1);
1259 
1260  KernelGlobals *kg = new ((void *)kgbuffer.device_pointer)
1261  KernelGlobals(thread_kernel_globals_init());
1262 
1263  profiler.add_state(&kg->profiler);
1264 
1265  CPUSplitKernel *split_kernel = NULL;
1266  if (use_split_kernel) {
1267  split_kernel = new CPUSplitKernel(this);
1268  if (!split_kernel->load_kernels(requested_features)) {
1269  thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
1270  kgbuffer.free();
1271  delete split_kernel;
1272  return;
1273  }
1274  }
1275 
1276  /* NLM denoiser. */
1277  DenoisingTask *denoising = NULL;
1278 
1279  /* OpenImageDenoise: we can only denoise with one thread at a time, so to
1280  * avoid waiting with mutex locks in the denoiser, we let only a single
1281  * thread acquire denoising tiles. */
1282  uint tile_types = task.tile_types;
1283  bool hold_denoise_lock = false;
1284  if ((tile_types & RenderTile::DENOISE) && task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
1285  if (!oidn_task_lock.try_lock()) {
1286  tile_types &= ~RenderTile::DENOISE;
1287  hold_denoise_lock = true;
1288  }
1289  }
1290 
1291  RenderTile tile;
1292  while (task.acquire_tile(this, tile, tile_types)) {
1293  if (tile.task == RenderTile::PATH_TRACE) {
1294  if (use_split_kernel) {
1295  device_only_memory<uchar> void_buffer(this, "void_buffer");
1296  split_kernel->path_trace(task, tile, kgbuffer, void_buffer);
1297  }
1298  else {
1299  render(task, tile, kg);
1300  }
1301  }
1302  else if (tile.task == RenderTile::BAKE) {
1303  render(task, tile, kg);
1304  }
1305  else if (tile.task == RenderTile::DENOISE) {
1306  if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
1308  }
1309  else if (task.denoising.type == DENOISER_NLM) {
1310  if (denoising == NULL) {
1311  denoising = new DenoisingTask(this, task);
1312  denoising->profiler = &kg->profiler;
1313  }
1314  denoise_nlm(*denoising, tile);
1315  }
1316  task.update_progress(&tile, tile.w * tile.h);
1317  }
1318 
1319  task.release_tile(tile);
1320 
1321  if (TaskPool::canceled()) {
1322  if (task.need_finish_queue == false)
1323  break;
1324  }
1325  }
1326 
1327  if (hold_denoise_lock) {
1328  oidn_task_lock.unlock();
1329  }
1330 
1331  profiler.remove_state(&kg->profiler);
1332 
1333  thread_kernel_globals_free((KernelGlobals *)kgbuffer.device_pointer);
1334  kg->~KernelGlobals();
1335  kgbuffer.free();
1336  delete split_kernel;
1337  delete denoising;
1338  }
1339 
1341  {
1342  RenderTile tile;
1343  tile.x = task.x;
1344  tile.y = task.y;
1345  tile.w = task.w;
1346  tile.h = task.h;
1347  tile.buffer = task.buffer;
1348  tile.sample = task.sample + task.num_samples;
1349  tile.num_samples = task.num_samples;
1350  tile.start_sample = task.sample;
1351  tile.offset = task.offset;
1352  tile.stride = task.stride;
1353  tile.buffers = task.buffers;
1354 
1355  if (task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
1357  }
1358  else {
1359  DenoisingTask denoising(this, task);
1360 
1361  ProfilingState denoising_profiler_state;
1362  profiler.add_state(&denoising_profiler_state);
1363  denoising.profiler = &denoising_profiler_state;
1364 
1365  denoise_nlm(denoising, tile);
1366 
1367  profiler.remove_state(&denoising_profiler_state);
1368  }
1369 
1370  task.update_progress(&tile, tile.w * tile.h);
1371  }
1372 
1374  {
1375  float sample_scale = 1.0f / (task.sample + 1);
1376 
1377  if (task.rgba_half) {
1378  for (int y = task.y; y < task.y + task.h; y++)
1379  for (int x = task.x; x < task.x + task.w; x++)
1381  (uchar4 *)task.rgba_half,
1382  (float *)task.buffer,
1383  sample_scale,
1384  x,
1385  y,
1386  task.offset,
1387  task.stride);
1388  }
1389  else {
1390  for (int y = task.y; y < task.y + task.h; y++)
1391  for (int x = task.x; x < task.x + task.w; x++)
1393  (uchar4 *)task.rgba_byte,
1394  (float *)task.buffer,
1395  sample_scale,
1396  x,
1397  y,
1398  task.offset,
1399  task.stride);
1400  }
1401  }
1402 
1404  {
1405  KernelGlobals *kg = new KernelGlobals(thread_kernel_globals_init());
1406 
1407  for (int sample = 0; sample < task.num_samples; sample++) {
1408  for (int x = task.shader_x; x < task.shader_x + task.shader_w; x++)
1409  shader_kernel()(kg,
1410  (uint4 *)task.shader_input,
1411  (float4 *)task.shader_output,
1412  task.shader_eval_type,
1413  task.shader_filter,
1414  x,
1415  task.offset,
1416  sample);
1417 
1418  if (task.get_cancel() || TaskPool::canceled())
1419  break;
1420 
1421  task.update_progress(NULL);
1422  }
1423 
1425  delete kg;
1426  }
1427 
1428  virtual int get_split_task_count(DeviceTask &task) override
1429  {
1430  if (task.type == DeviceTask::SHADER)
1431  return task.get_subtask_count(info.cpu_threads, 256);
1432  else
1433  return task.get_subtask_count(info.cpu_threads);
1434  }
1435 
1436  virtual void task_add(DeviceTask &task) override
1437  {
1438  /* Load texture info. */
1440 
1441  /* split task into smaller ones */
1442  list<DeviceTask> tasks;
1443 
1444  if (task.type == DeviceTask::DENOISE_BUFFER &&
1445  task.denoising.type == DENOISER_OPENIMAGEDENOISE) {
1446  /* Denoise entire buffer at once with OIDN, it has own threading. */
1447  tasks.push_back(task);
1448  }
1449  else if (task.type == DeviceTask::SHADER) {
1450  task.split(tasks, info.cpu_threads, 256);
1451  }
1452  else {
1453  task.split(tasks, info.cpu_threads);
1454  }
1455 
1456  foreach (DeviceTask &task, tasks) {
1457  task_pool.push([=] {
1458  DeviceTask task_copy = task;
1459  thread_run(task_copy);
1460  });
1461  }
1462  }
1463 
  /* Block until all work queued on the device's task pool has finished. */
  virtual void task_wait() override
  {
    task_pool.wait_work();
  }
1468 
  /* Request cancellation of all tasks currently in the device's task pool. */
  virtual void task_cancel() override
  {
    task_pool.cancel();
  }
1473 
1474  protected:
1475  inline KernelGlobals thread_kernel_globals_init()
1476  {
1477  KernelGlobals kg = kernel_globals;
1478  kg.transparent_shadow_intersections = NULL;
1479  const int decoupled_count = sizeof(kg.decoupled_volume_steps) /
1480  sizeof(*kg.decoupled_volume_steps);
1481  for (int i = 0; i < decoupled_count; ++i) {
1482  kg.decoupled_volume_steps[i] = NULL;
1483  }
1484  kg.decoupled_volume_steps_index = 0;
1485  kg.coverage_asset = kg.coverage_object = kg.coverage_material = NULL;
1486 #ifdef WITH_OSL
1487  OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
1488 #endif
1489  return kg;
1490  }
1491 
1492  inline void thread_kernel_globals_free(KernelGlobals *kg)
1493  {
1494  if (kg == NULL) {
1495  return;
1496  }
1497 
1498  if (kg->transparent_shadow_intersections != NULL) {
1499  free(kg->transparent_shadow_intersections);
1500  }
1501  const int decoupled_count = sizeof(kg->decoupled_volume_steps) /
1502  sizeof(*kg->decoupled_volume_steps);
1503  for (int i = 0; i < decoupled_count; ++i) {
1504  if (kg->decoupled_volume_steps[i] != NULL) {
1505  free(kg->decoupled_volume_steps[i]);
1506  }
1507  }
1508 #ifdef WITH_OSL
1509  OSLShader::thread_free(kg);
1510 #endif
1511  }
1512 
  /* CPU kernels are compiled into the binary, so "loading" only records the
   * requested features (used later when the split kernel loads) and always
   * succeeds. */
  virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override
  {
    requested_features = requested_features_;

    return true;
  }
1519 };
1520 
1521 /* split kernel */
1522 
1524  public:
1526  void (*func)(KernelGlobals *kg, KernelData *data);
1527 
1529  {
1530  }
1532  {
1533  }
1534 
1535  virtual bool enqueue(const KernelDimensions &dim,
1536  device_memory &kernel_globals,
1538  {
1539  if (!func) {
1540  return false;
1541  }
1542 
1543  KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
1544  kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
1545 
1546  for (int y = 0; y < dim.global_size[1]; y++) {
1547  for (int x = 0; x < dim.global_size[0]; x++) {
1548  kg->global_id = make_int2(x, y);
1549 
1550  func(kg, (KernelData *)data.device_pointer);
1551  }
1552  }
1553 
1554  return true;
1555  }
1556 };
1557 
/* NOTE(review): the constructor signature/initializer-list line was lost in
 * extraction; the member index declares CPUSplitKernel(CPUDevice *device) and
 * the body is empty — restore the signature from the pristine source. */
{
}
1561 
1563  RenderTile &rtile,
1564  int num_global_elements,
1565  device_memory &kernel_globals,
1567  device_memory &split_data,
1570  device_memory &use_queues_flags,
1571  device_memory &work_pool_wgs)
1572 {
1573  KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
1574  kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);
1575 
1576  for (int y = 0; y < dim.global_size[1]; y++) {
1577  for (int x = 0; x < dim.global_size[0]; x++) {
1578  kg->global_id = make_int2(x, y);
1579 
1580  device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer,
1581  (KernelData *)data.device_pointer,
1582  (void *)split_data.device_pointer,
1583  num_global_elements,
1584  (char *)ray_state.device_pointer,
1585  rtile.start_sample,
1586  rtile.start_sample + rtile.num_samples,
1587  rtile.x,
1588  rtile.y,
1589  rtile.w,
1590  rtile.h,
1591  rtile.offset,
1592  rtile.stride,
1593  (int *)queue_index.device_pointer,
1594  dim.global_size[0] * dim.global_size[1],
1595  (char *)use_queues_flags.device_pointer,
1596  (uint *)work_pool_wgs.device_pointer,
1597  rtile.num_samples,
1598  (float *)rtile.buffer);
1599  }
1600  }
1601 
1602  return true;
1603 }
1604 
1606  const DeviceRequestedFeatures &)
1607 {
1608  CPUSplitKernelFunction *kernel = new CPUSplitKernelFunction(device);
1609 
1610  kernel->func = device->split_kernels[kernel_name]();
1611  if (!kernel->func) {
1612  delete kernel;
1613  return NULL;
1614  }
1615 
1616  return kernel;
1617 }
1618 
1620 {
1621  return make_int2(1, 1);
1622 }
1623 
1625  device_memory & /*data*/,
1626  DeviceTask & /*task*/)
1627 {
1628  return make_int2(1, 1);
1629 }
1630 
1632  device_memory & /*data*/,
1633  size_t num_threads)
1634 {
1635  KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
1636 
1637  return split_data_buffer_size(kg, num_threads);
1638 }
1639 
/* Factory function: create a new CPU render device instance. */
Device *device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
{
  return new CPUDevice(info, stats, profiler, background);
}
1644 
1646 {
1647  DeviceInfo info;
1648 
1649  info.type = DEVICE_CPU;
1651  info.id = "CPU";
1652  info.num = 0;
1653  info.has_volume_decoupled = true;
1654  info.has_adaptive_stop_per_sample = true;
1655  info.has_osl = true;
1656  info.has_half_images = true;
1657  info.has_nanovdb = true;
1658  info.has_profiling = true;
1659  info.denoisers = DENOISER_NLM;
1662  }
1663 
1664  devices.insert(devices.begin(), info);
1665 }
1666 
1668 {
1669  string capabilities = "";
1670  capabilities += system_cpu_support_sse2() ? "SSE2 " : "";
1671  capabilities += system_cpu_support_sse3() ? "SSE3 " : "";
1672  capabilities += system_cpu_support_sse41() ? "SSE41 " : "";
1673  capabilities += system_cpu_support_avx() ? "AVX " : "";
1674  capabilities += system_cpu_support_avx2() ? "AVX2" : "";
1675  if (capabilities[capabilities.size() - 1] == ' ')
1676  capabilities.resize(capabilities.size() - 1);
1677  return capabilities;
1678 }
1679 
typedef float(TangentPoint)[2]
void BLI_kdtree_nd_() free(KDTree *tree)
Definition: kdtree_impl.h:116
unsigned int uint
Definition: BLI_sys_types.h:83
ThreadMutex mutex
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_BOOL GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET 
_GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLushort pattern _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble u2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLdouble GLdouble v2 _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLdouble GLdouble nz _GL_VOID_RET _GL_VOID GLfloat GLfloat nz _GL_VOID_RET _GL_VOID GLint GLint nz _GL_VOID_RET _GL_VOID GLshort GLshort nz _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const GLfloat *values _GL_VOID_RET _GL_VOID GLsizei const GLushort *values _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID const GLuint const GLclampf *priorities _GL_VOID_RET _GL_VOID GLdouble y _GL_VOID_RET _GL_VOID GLfloat y _GL_VOID_RET _GL_VOID GLint y _GL_VOID_RET _GL_VOID GLshort y _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLfloat GLfloat z _GL_VOID_RET _GL_VOID GLint GLint z _GL_VOID_RET _GL_VOID GLshort GLshort z _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble w _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat w _GL_VOID_RET _GL_VOID GLint GLint GLint w _GL_VOID_RET _GL_VOID GLshort GLshort GLshort w _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble y2 _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat y2 _GL_VOID_RET _GL_VOID GLint GLint GLint y2 _GL_VOID_RET _GL_VOID GLshort GLshort GLshort y2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLuint *buffer _GL_VOID_RET _GL_VOID GLdouble t _GL_VOID_RET _GL_VOID GLfloat t _GL_VOID_RET _GL_VOID GLint t _GL_VOID_RET _GL_VOID GLshort t _GL_VOID_RET _GL_VOID GLdouble GLdouble r _GL_VOID_RET _GL_VOID GLfloat GLfloat r _GL_VOID_RET _GL_VOID GLint GLint r _GL_VOID_RET _GL_VOID 
GLshort GLshort r _GL_VOID_RET _GL_VOID GLdouble GLdouble r
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint y
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei stride
Group RGB to Bright Vector Camera Vector Combine Material Light Line Style Layer Add Ambient Diffuse Glossy Refraction Transparent Toon Principled Hair Volume Principled Light Particle Volume Image Sky Noise Wave Voronoi Brick Texture Vector Combine Vertex Separate Vector White RGB Map Separate Set Z Dilate Combine Combine Color Channel Split ID Combine Luminance Directional Alpha Distance Hue Movie Ellipse Bokeh View Corner DENOISE
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition: btDbvt.cpp:52
void refit(btStridingMeshInterface *triangles, const btVector3 &aabbMin, const btVector3 &aabbMax)
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition: btQuadWord.h:119
int BVHLayoutMask
Definition: bvh_params.h:39
BVHLayout bvh_layout
Definition: bvh_params.h:70
bool top_level
Definition: bvh_params.h:67
Definition: bvh/bvh.h:80
BVHParams params
Definition: bvh/bvh.h:82
virtual void mem_zero(device_memory &mem) override
Definition: device_cpu.cpp:441
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr, DenoisingTask *task)
Definition: device_cpu.cpp:585
bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr, device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, device_ptr buffer_variance_ptr, DenoisingTask *task)
Definition: device_cpu.cpp:780
KernelFunctions< void(*)(float *, float *, int *, int)> filter_nlm_normalize_kernel
Definition: device_cpu.cpp:231
bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, device_ptr variance_ptr, int r, int4 rect, DenoisingTask *task)
Definition: device_cpu.cpp:755
virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int) override
Definition: device_cpu.cpp:470
bool need_texture_info
Definition: device_cpu.cpp:182
KernelFunctions< void(*)(KernelGlobals *, uint4 *, float4 *, int, int, int, int, int)> shader_kernel
Definition: device_cpu.cpp:207
virtual void mem_copy_from(device_memory &, int, int, int, int) override
Definition: device_cpu.cpp:435
virtual bool show_samples() const override
Definition: device_cpu.cpp:362
virtual void task_cancel() override
bool adaptive_sampling_filter(KernelGlobals *kg, RenderTile &tile, int sample)
Definition: device_cpu.cpp:879
KernelFunctions< void(*)(KernelGlobals *, float *, int, int, int, int, int)> bake_kernel
Definition: device_cpu.cpp:208
virtual BVHLayoutMask get_bvh_layout_mask() const override
Definition: device_cpu.cpp:367
virtual void mem_free(device_memory &mem) override
Definition: device_cpu.cpp:452
thread_spin_lock oidn_task_lock
Definition: device_cpu.cpp:191
TaskPool task_pool
Definition: device_cpu.cpp:178
virtual bool load_kernels(const DeviceRequestedFeatures &requested_features_) override
KernelFunctions< void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_half_float_kernel
Definition: device_cpu.cpp:203
void load_texture_info()
Definition: device_cpu.cpp:376
KernelGlobals thread_kernel_globals_init()
void denoise_openimagedenoise(DeviceTask &task, RenderTile &rtile)
KernelFunctions< void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)> convert_to_byte_kernel
Definition: device_cpu.cpp:205
KernelFunctions< void(*)(float *, float *, int *, int, int)> filter_nlm_calc_weight_kernel
Definition: device_cpu.cpp:227
virtual int get_split_task_count(DeviceTask &task) override
KernelFunctions< void(*)(int, int, float *, float *, float *, float *, int *, int)> filter_detect_outliers_kernel
Definition: device_cpu.cpp:219
KernelFunctions< void(*)(int, int, int, float *, float *, float *, int *, float *, float3 *, int *, int *, int, int, int, int, bool)> filter_nlm_construct_gramian_kernel
Definition: device_cpu.cpp:252
void tex_alloc(device_texture &mem)
Definition: device_cpu.cpp:511
void render(DeviceTask &task, RenderTile &tile, KernelGlobals *kg)
Definition: device_cpu.cpp:934
KernelFunctions< void(*)(int, int, int, int *, float *, float *, int, int *)> filter_write_feature_kernel
Definition: device_cpu.cpp:217
KernelGlobals kernel_globals
Definition: device_cpu.cpp:179
void denoise_nlm(DenoisingTask &denoising, RenderTile &tile)
bool denoising_construct_transform(DenoisingTask *task)
Definition: device_cpu.cpp:654
virtual void mem_alloc(device_memory &mem) override
Definition: device_cpu.cpp:384
KernelFunctions< void(*)(int, TileInfo *, int, int, float *, float *, float *, float *, float *, int *, int, int)> filter_divide_shadow_kernel
Definition: device_cpu.cpp:212
void tex_free(device_texture &mem)
Definition: device_cpu.cpp:532
void global_alloc(device_memory &mem)
Definition: device_cpu.cpp:489
KernelFunctions< void(*)(int, int, float *, float *, float *, float *, float *, int *, int, int, int)> filter_nlm_update_output_kernel
Definition: device_cpu.cpp:230
bool denoising_write_feature(int out_offset, device_ptr from_ptr, device_ptr buffer_ptr, DenoisingTask *task)
Definition: device_cpu.cpp:836
CPUDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
Definition: device_cpu.cpp:283
void thread_film_convert(DeviceTask &task)
virtual void * osl_memory() override
Definition: device_cpu.cpp:542
void thread_denoise(DeviceTask &task)
KernelFunctions< void(*)(int, int, float *, float *, float *, float *, int *, int)> filter_combine_halves_kernel
Definition: device_cpu.cpp:221
virtual void const_copy_to(const char *name, void *host, size_t size) override
Definition: device_cpu.cpp:475
KernelFunctions< void(*)(int, int, float *, float *, float *, float *, int *, int, int, int, float, float)> filter_nlm_calc_difference_kernel
Definition: device_cpu.cpp:225
void thread_kernel_globals_free(KernelGlobals *kg)
KernelFunctions< void(*)(int, int, int, float *, int *, float *, float3 *, int *, int)> filter_finalize_kernel
Definition: device_cpu.cpp:254
bool denoising_get_feature(int mean_offset, int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, float scale, DenoisingTask *task)
Definition: device_cpu.cpp:808
virtual void task_wait() override
KernelFunctions< void(*)(KernelGlobals *, float *, int, int, int, int, int)> path_trace_kernel
Definition: device_cpu.cpp:201
bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
Definition: device_cpu.cpp:737
device_vector< TextureInfo > texture_info
Definition: device_cpu.cpp:181
KernelFunctions< void(*)(int, TileInfo *, int, int, int, int, float *, float *, float, int *, int, int)> filter_get_feature_kernel
Definition: device_cpu.cpp:215
bool use_split_kernel
Definition: device_cpu.cpp:197
virtual void mem_copy_to(device_memory &mem) override
Definition: device_cpu.cpp:413
KernelFunctions< void(*)(KernelGlobals *, ccl_constant KernelData *, ccl_global void *, int, ccl_global char *, int, int, int, int, int, int, int, int, ccl_global int *, int, ccl_global char *, ccl_global unsigned int *, unsigned int, ccl_global float *)> data_init_kernel
Definition: device_cpu.cpp:275
bool denoising_accumulate(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame, DenoisingTask *task)
Definition: device_cpu.cpp:678
void thread_shader(DeviceTask &task)
bool denoising_detect_outliers(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, device_ptr output_ptr, DenoisingTask *task)
Definition: device_cpu.cpp:856
KernelFunctions< void(*)(float *, float *, int *, int, int)> filter_nlm_blur_kernel
Definition: device_cpu.cpp:226
void adaptive_sampling_post(const RenderTile &tile, KernelGlobals *kg)
Definition: device_cpu.cpp:913
void build_bvh(BVH *bvh, Progress &progress, bool refit) override
Definition: device_cpu.cpp:551
DeviceRequestedFeatures requested_features
Definition: device_cpu.cpp:199
KernelFunctions< void(*)(float *, TileInfo *, int, int, int, float *, int *, int *, int, int, bool, int, float)> filter_construct_transform_kernel
Definition: device_cpu.cpp:235
void thread_run(DeviceTask &task)
Definition: device_cpu.cpp:573
virtual void task_add(DeviceTask &task) override
unordered_map< string, KernelFunctions< void(*)(KernelGlobals *, KernelData *)> > split_kernels
Definition: device_cpu.cpp:276
void denoise_openimagedenoise_buffer(DeviceTask &task, float *buffer, const size_t offset, const size_t stride, const size_t x, const size_t y, const size_t w, const size_t h, const float scale)
void global_free(device_memory &mem)
Definition: device_cpu.cpp:502
void thread_render(DeviceTask &task)
void(* func)(KernelGlobals *kg, KernelData *data)
virtual bool enqueue(const KernelDimensions &dim, device_memory &kernel_globals, device_memory &data)
CPUSplitKernelFunction(CPUDevice *device)
virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
virtual int2 split_kernel_local_size()
virtual SplitKernelFunction * get_split_kernel_function(const string &kernel_name, const DeviceRequestedFeatures &)
CPUSplitKernel(CPUDevice *device)
virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim, RenderTile &rtile, int num_global_elements, device_memory &kernel_globals, device_memory &kernel_data_, device_memory &split_data, device_memory &ray_state, device_memory &queue_index, device_memory &use_queues_flag, device_memory &work_pool_wgs)
virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask &task)
void init_pixel(int x, int y)
Definition: coverage.cpp:72
void finalize()
Definition: coverage.cpp:36
void init_path_trace()
Definition: coverage.cpp:52
void run_denoising(RenderTile &tile)
struct DenoisingTask::RenderBuffers render_buffer
struct DenoisingTask::DeviceFunctions functions
struct DenoisingTask::DenoiseBuffers buffer
ProfilingState * profiler
bool has_half_images
Definition: device.h:79
int num
Definition: device.h:77
string id
Definition: device.h:76
int cpu_threads
Definition: device.h:89
bool has_osl
Definition: device.h:84
DenoiserTypeMask denoisers
Definition: device.h:88
bool has_nanovdb
Definition: device.h:80
bool has_profiling
Definition: device.h:86
bool has_adaptive_stop_per_sample
Definition: device.h:83
DeviceType type
Definition: device.h:74
string description
Definition: device.h:75
bool has_volume_decoupled
Definition: device.h:81
bool load_kernels(const DeviceRequestedFeatures &requested_features)
bool path_trace(DeviceTask &task, RenderTile &rtile, device_memory &kgbuffer, device_memory &kernel_data)
Definition: device.h:293
Profiler & profiler
Definition: device.h:362
Stats & stats
Definition: device.h:361
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit)
Definition: device.cpp:369
DeviceInfo info
Definition: device.h:337
KernelFunctions(F kernel_default, F kernel_sse2, F kernel_sse3, F kernel_sse41, F kernel_avx, F kernel_avx2)
Definition: device_cpu.cpp:82
F operator()() const
Definition: device_cpu.cpp:142
void add_state(ProfilingState *state)
void remove_state(ProfilingState *state)
device_vector< float > buffer
Definition: buffers.h:80
double render_time
Definition: buffers.h:82
static const int SIZE
Definition: buffers.h:173
RenderTile target
Definition: buffers.h:177
RenderTile tiles[SIZE]
Definition: buffers.h:176
int4 bounds() const
Definition: buffers.h:184
static const int CENTER
Definition: buffers.h:174
int stride
Definition: buffers.h:143
int sample
Definition: buffers.h:140
@ WAS_STOLEN
Definition: buffers.h:149
@ CAN_BE_STOLEN
Definition: buffers.h:149
RenderBuffers * buffers
Definition: buffers.h:152
int num_samples
Definition: buffers.h:139
@ PATH_TRACE
Definition: buffers.h:134
device_ptr buffer
Definition: buffers.h:146
Task task
Definition: buffers.h:136
StealingState stealing_state
Definition: buffers.h:150
int offset
Definition: buffers.h:142
int start_sample
Definition: buffers.h:138
int4 bounds() const
Definition: buffers.h:156
void mem_free(size_t size)
Definition: util_stats.h:42
void mem_alloc(size_t size)
Definition: util_stats.h:36
static int num_threads()
Definition: util_task.cpp:112
T * data()
Definition: util_array.h:208
T * resize(size_t newsize)
Definition: util_array.h:150
const char * name
MemoryType type
size_t memory_elements_size(int elements)
void * host_pointer
size_t memory_size()
device_ptr device_pointer
size_t device_size
void alloc_to_device(size_t num, bool shrink_to_fit=true)
TextureInfo info
void copy_from_device()
size_t size() const
void copy_to_device()
T * resize(size_t width, size_t height=0, size_t depth=0)
int x
Definition: btConvexHull.h:149
int w
Definition: btConvexHull.h:149
int y
Definition: btConvexHull.h:149
int z
Definition: btConvexHull.h:149
#define function_bind
@ DEVICE_CPU
Definition: device.h:45
Device * device_cpu_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
static const char * logged_architecture
Definition: device_cpu.cpp:73
void device_cpu_info(vector< DeviceInfo > &devices)
#define REGISTER_KERNEL(name)
#define REGISTER_SPLIT_KERNEL(name)
#define KERNEL_FUNCTIONS(name)
Definition: device_cpu.cpp:278
string device_cpu_capabilities()
@ MEM_PIXELS
Definition: device_memory.h:41
@ MEM_GLOBAL
Definition: device_memory.h:39
@ MEM_TEXTURE
Definition: device_memory.h:40
@ MEM_DEVICE_ONLY
Definition: device_memory.h:38
@ DENOISER_NLM
Definition: device_task.h:36
@ DENOISER_OPENIMAGEDENOISE
Definition: device_task.h:38
@ DENOISER_INPUT_RGB_ALBEDO
Definition: device_task.h:47
@ DENOISER_INPUT_RGB_ALBEDO_NORMAL
Definition: device_task.h:48
void KERNEL_FUNCTION_FULL_NAME() filter_nlm_calc_difference(int dx, int dy, float *weight_image, float *variance_image, float *scale_image, float *difference_image, int *rect, int stride, int channel_offset, int frame_offset, float a, float k_2)
void KERNEL_FUNCTION_FULL_NAME() filter_nlm_update_output(int dx, int dy, float *difference_image, float *image, float *temp_image, float *out_image, float *accum_image, int *rect, int channel_offset, int stride, int f)
void KERNEL_FUNCTION_FULL_NAME() filter_write_feature(int sample, int x, int y, int *buffer_params, float *from, float *buffer, int out_offset, int *prefilter_rect)
void KERNEL_FUNCTION_FULL_NAME() filter_get_feature(int sample, TileInfo *tile_info, int m_offset, int v_offset, int x, int y, float *mean, float *variance, float scale, int *prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset)
void KERNEL_FUNCTION_FULL_NAME() filter_detect_outliers(int x, int y, ccl_global float *image, ccl_global float *variance, ccl_global float *depth, ccl_global float *output, int *rect, int pass_stride)
void KERNEL_FUNCTION_FULL_NAME() filter_nlm_calc_weight(float *difference_image, float *out_image, int *rect, int stride, int f)
void KERNEL_FUNCTION_FULL_NAME() filter_finalize(int x, int y, int storage_ofs, float *buffer, int *rank, float *XtWX, float3 *XtWY, int *buffer_params, int sample)
void KERNEL_FUNCTION_FULL_NAME() filter_nlm_construct_gramian(int dx, int dy, int t, float *difference_image, float *buffer, float *transform, int *rank, float *XtWX, float3 *XtWY, int *rect, int *filter_window, int stride, int f, int pass_stride, int frame_offset, bool use_time)
void KERNEL_FUNCTION_FULL_NAME() filter_construct_transform(float *buffer, TileInfo *tiles, int x, int y, int storage_ofs, float *transform, int *rank, int *rect, int pass_stride, int frame_stride, bool use_time, int radius, float pca_threshold)
void KERNEL_FUNCTION_FULL_NAME() filter_combine_halves(int x, int y, float *mean, float *variance, float *a, float *b, int *prefilter_rect, int r)
void KERNEL_FUNCTION_FULL_NAME() filter_nlm_blur(float *difference_image, float *out_image, int *rect, int stride, int f)
void KERNEL_FUNCTION_FULL_NAME() filter_nlm_normalize(float *out_image, float *accum_image, int *rect, int stride)
void KERNEL_FUNCTION_FULL_NAME() filter_divide_shadow(int sample, TileInfo *tile_info, int x, int y, float *unfilteredA, float *unfilteredB, float *sampleV, float *sampleVV, float *bufferV, int *prefilter_rect, int buffer_pass_stride, int buffer_denoising_offset)
ccl_device_intersect bool scene_intersect(KernelGlobals *kg, const Ray *ray, const uint visibility, Intersection *isect)
void kernel_const_copy(KernelGlobals *kg, const char *name, void *host, size_t size)
Definition: kernel.cpp:67
void kernel_global_memory_copy(KernelGlobals *kg, const char *name, void *mem, size_t size)
Definition: kernel.cpp:77
CCL_NAMESPACE_BEGIN ccl_device void kernel_do_adaptive_stopping(KernelGlobals *kg, ccl_global float *buffer, int sample)
ccl_device bool kernel_do_adaptive_filter_x(KernelGlobals *kg, int y, ccl_global WorkTile *tile)
ccl_device void kernel_adaptive_post_adjust(KernelGlobals *kg, ccl_global float *buffer, float sample_multiplier)
ccl_device bool kernel_do_adaptive_filter_y(KernelGlobals *kg, int x, ccl_global WorkTile *tile)
#define kernel_data
#define ccl_constant
#define ccl_global
#define CCL_NAMESPACE_END
#define make_int4(x, y, z, w)
#define make_int2(x, y)
void KERNEL_FUNCTION_FULL_NAME() convert_to_half_float(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME() path_trace(KernelGlobals *kg, float *buffer, int sample, int x, int y, int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME() convert_to_byte(KernelGlobals *kg, uchar4 *rgba, float *buffer, float sample_scale, int x, int y, int offset, int stride)
void KERNEL_FUNCTION_FULL_NAME() data_init(KernelGlobals *kg, ccl_constant KernelData *data, ccl_global void *split_data_buffer, int num_elements, ccl_global char *ray_state, int start_sample, int end_sample, int sx, int sy, int sw, int sh, int offset, int stride, ccl_global int *Queue_index, int queuesize, ccl_global char *use_queues_flag, ccl_global unsigned int *work_pool_wgs, unsigned int num_samples, ccl_global float *buffer)
void KERNEL_FUNCTION_FULL_NAME() shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int filter, int i, int offset, int sample)
ccl_device_noinline_cpu float3 indirect_background(KernelGlobals *kg, ShaderData *emission_sd, ccl_addr_space PathState *state, ccl_global float *buffer, ccl_addr_space Ray *ray)
CCL_NAMESPACE_BEGIN ccl_device_inline uint64_t split_data_buffer_size(KernelGlobals *kg, size_t num_elements)
__kernel void ccl_constant KernelData ccl_global void ccl_global char ccl_global int ccl_global char ccl_global unsigned int ccl_global float * buffer
__kernel void ccl_constant KernelData ccl_global void ccl_global char ccl_global int ccl_global char * use_queues_flag
__kernel void ccl_constant KernelData ccl_global void ccl_global char ccl_global int * queue_index
__kernel void ccl_constant KernelData ccl_global void ccl_global char * ray_state
@ CRYPT_ACCURATE
Definition: kernel_types.h:405
@ BVH_LAYOUT_EMBREE
@ BVH_LAYOUT_BVH2
@ BVH_LAYOUT_MULTI_OPTIX_EMBREE
@ DENOISING_PASS_ALBEDO
Definition: kernel_types.h:411
@ DENOISING_PASS_COLOR
Definition: kernel_types.h:417
@ DENOISING_PASS_NORMAL
Definition: kernel_types.h:409
#define F
static unsigned a[3]
Definition: RandGen.cpp:92
static void sample(SocketReader *reader, int x, int y, float color[4])
Vector< CPUDevice > devices
list of all CPUDevices. for every hardware thread an instance of CPUDevice is created
struct blender::compositor::@172::@174 task
static int bake(const BakeAPIRender *bkr, Object *ob_low, const ListBase *selected_objects, ReportList *reports)
#define min(a, b)
Definition: sort.c:51
__int64 int64_t
Definition: stdint.h:92
unsigned __int64 uint64_t
Definition: stdint.h:93
bool has_avx2()
Definition: util_debug.h:55
bool has_sse41()
Definition: util_debug.h:63
bool has_sse2()
Definition: util_debug.h:71
bool has_sse3()
Definition: util_debug.h:67
bool has_avx()
Definition: util_debug.h:59
function< bool(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, device_ptr variance_ptr, int r, int4 rect)> combine_halves
function< bool(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, device_ptr output_ptr)> detect_outliers
function< bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature
function< bool(device_ptr output_ptr)> solve
function< bool(device_ptr a_ptr, device_ptr b_ptr, device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, device_ptr buffer_variance_ptr)> divide_shadow
function< bool()> construct_transform
function< bool(int mean_offset, int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, float scale)> get_feature
function< bool(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)> accumulate
function< bool(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr)> non_local_means
void push(TaskRunFunction &&task)
Definition: util_task.cpp:36
static bool canceled()
Definition: util_task.cpp:63
void cancel()
Definition: util_task.cpp:54
void wait_work(Summary *stats=NULL)
Definition: util_task.cpp:42
ccl_global float * buffer
float max
void util_aligned_free(void *ptr)
CCL_NAMESPACE_BEGIN void * util_aligned_malloc(size_t size, int alignment)
#define MIN_ALIGNMENT_CPU_DATA_TYPES
__forceinline bool any(const avxb &b)
Definition: util_avxb.h:218
DebugFlags & DebugFlags()
Definition: util_debug.h:205
#define VLOG(severity)
Definition: util_logging.h:50
static CCL_NAMESPACE_BEGIN bool openimagedenoise_supported()
void path_init(const string &path, const string &user_path)
Definition: util_path.cpp:338
@ PROFILING_DENOISING
@ PROFILING_DENOISING_COMBINE_HALVES
@ PROFILING_DENOISING_RECONSTRUCT
@ PROFILING_DENOISING_GET_FEATURE
@ PROFILING_DENOISING_DIVIDE_SHADOW
@ PROFILING_DENOISING_DETECT_OUTLIERS
@ PROFILING_DENOISING_CONSTRUCT_TRANSFORM
@ PROFILING_DENOISING_NON_LOCAL_MEANS
ccl_device_inline int4 rect_clip(int4 a, int4 b)
Definition: util_rect.h:38
ccl_device_inline int4 rect_expand(int4 rect, int d)
Definition: util_rect.h:32
ccl_device_inline int rect_size(int4 rect)
Definition: util_rect.h:65
#define SIMD_SET_FLUSH_TO_ZERO
Definition: util_simd.h:49
string string_human_readable_size(size_t size)
string string_human_readable_number(size_t num)
bool system_cpu_support_avx2()
string system_cpu_brand_string()
bool system_cpu_support_avx()
bool system_cpu_support_sse3()
bool system_cpu_support_sse41()
bool system_cpu_support_sse2()
std::unique_lock< std::mutex > thread_scoped_lock
Definition: util_thread.h:41
CCL_NAMESPACE_BEGIN typedef std::mutex thread_mutex
Definition: util_thread.h:40
tbb::spin_mutex thread_spin_lock
Definition: util_thread.h:68
ccl_device_inline size_t align_up(size_t offset, size_t alignment)
Definition: util_types.h:65
uint64_t device_ptr
Definition: util_types.h:62