Blender V2.93
device_split_kernel.cpp
/*
 * Copyright 2011-2016 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "device/device_split_kernel.h"

#include "kernel/kernel_types.h"
#include "kernel/split/kernel_split_data_types.h"

#include "util/util_logging.h"
#include "util/util_time.h"

CCL_NAMESPACE_BEGIN

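/* The rolling average below is an exponential moving average:
 * avg = alpha * new_value + (1 - alpha) * avg,
 * so roughly the last 1 / alpha = 10 sample batches dominate the estimate. */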
static const double alpha = 0.1; /* alpha for rolling average */

DeviceSplitKernel::DeviceSplitKernel(Device *device)
    : device(device),
      split_data(device, "split_data"),
      ray_state(device, "ray_state", MEM_READ_WRITE),
      queue_index(device, "queue_index"),
      use_queues_flag(device, "use_queues_flag"),
      work_pool_wgs(device, "work_pool_wgs"),
      kernel_data_initialized(false)
{
  avg_time_per_sample = 0.0;

  kernel_path_init = NULL;
  kernel_scene_intersect = NULL;
  kernel_lamp_emission = NULL;
  kernel_do_volume = NULL;
  kernel_queue_enqueue = NULL;
  kernel_indirect_background = NULL;
  kernel_shader_setup = NULL;
  kernel_shader_sort = NULL;
  kernel_shader_eval = NULL;
  kernel_holdout_emission_blurring_pathtermination_ao = NULL;
  kernel_subsurface_scatter = NULL;
  kernel_direct_lighting = NULL;
  kernel_shadow_blocked_ao = NULL;
  kernel_shadow_blocked_dl = NULL;
  kernel_enqueue_inactive = NULL;
  kernel_next_iteration_setup = NULL;
  kernel_indirect_subsurface = NULL;
  kernel_buffer_update = NULL;
  kernel_adaptive_stopping = NULL;
  kernel_adaptive_filter_x = NULL;
  kernel_adaptive_filter_y = NULL;
  kernel_adaptive_adjust_samples = NULL;
}

DeviceSplitKernel::~DeviceSplitKernel()
{
  split_data.free();
  ray_state.free();
  use_queues_flag.free();
  queue_index.free();
  work_pool_wgs.free();

  delete kernel_path_init;
  delete kernel_scene_intersect;
  delete kernel_lamp_emission;
  delete kernel_do_volume;
  delete kernel_queue_enqueue;
  delete kernel_indirect_background;
  delete kernel_shader_setup;
  delete kernel_shader_sort;
  delete kernel_shader_eval;
  delete kernel_holdout_emission_blurring_pathtermination_ao;
  delete kernel_subsurface_scatter;
  delete kernel_direct_lighting;
  delete kernel_shadow_blocked_ao;
  delete kernel_shadow_blocked_dl;
  delete kernel_enqueue_inactive;
  delete kernel_next_iteration_setup;
  delete kernel_indirect_subsurface;
  delete kernel_buffer_update;
  delete kernel_adaptive_stopping;
  delete kernel_adaptive_filter_x;
  delete kernel_adaptive_filter_y;
  delete kernel_adaptive_adjust_samples;
}

bool DeviceSplitKernel::load_kernels(const DeviceRequestedFeatures &requested_features)
{
#define LOAD_KERNEL(name) \
  kernel_##name = get_split_kernel_function(#name, requested_features); \
  if (!kernel_##name) { \
    device->set_error(string("Split kernel error: failed to load kernel_") + #name); \
    return false; \
  }

  LOAD_KERNEL(path_init);
  LOAD_KERNEL(scene_intersect);
  LOAD_KERNEL(lamp_emission);
  if (requested_features.use_volume) {
    LOAD_KERNEL(do_volume);
  }
  LOAD_KERNEL(queue_enqueue);
  LOAD_KERNEL(indirect_background);
  LOAD_KERNEL(shader_setup);
  LOAD_KERNEL(shader_sort);
  LOAD_KERNEL(shader_eval);
  LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
  LOAD_KERNEL(subsurface_scatter);
  LOAD_KERNEL(direct_lighting);
  LOAD_KERNEL(shadow_blocked_ao);
  LOAD_KERNEL(shadow_blocked_dl);
  LOAD_KERNEL(enqueue_inactive);
  LOAD_KERNEL(next_iteration_setup);
  LOAD_KERNEL(indirect_subsurface);
  LOAD_KERNEL(buffer_update);
  LOAD_KERNEL(adaptive_stopping);
  LOAD_KERNEL(adaptive_filter_x);
  LOAD_KERNEL(adaptive_filter_y);
  LOAD_KERNEL(adaptive_adjust_samples);

#undef LOAD_KERNEL

  /* Re-initialize kernel-dependent data when kernels change. */
  kernel_data_initialized = false;

  return true;
}

size_t DeviceSplitKernel::max_elements_for_max_buffer_size(device_memory &kg,
                                                           device_memory &data,
                                                           uint64_t max_buffer_size)
{
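  /* Note: the state size is queried for 1024 elements and divided back down,
   * which amortizes any fixed overhead into the per-element estimate. */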
  uint64_t size_per_element = state_buffer_size(kg, data, 1024) / 1024;
  VLOG(1) << "Split state element size: " << string_human_readable_number(size_per_element)
          << " bytes. (" << string_human_readable_size(size_per_element) << ").";
  return max_buffer_size / size_per_element;
}

bool DeviceSplitKernel::path_trace(DeviceTask &task,
                                   RenderTile &tile,
                                   device_memory &kgbuffer,
                                   device_memory &kernel_data)
{
  if (device->have_error()) {
    return false;
  }

  /* Allocate all required global memory once. */
  if (!kernel_data_initialized) {
    kernel_data_initialized = true;

    /* Set local size */
    int2 lsize = split_kernel_local_size();
    local_size[0] = lsize[0];
    local_size[1] = lsize[1];

    /* Set global size */
    int2 gsize = split_kernel_global_size(kgbuffer, kernel_data, task);

    /* Make sure the global work size is a multiple of the local
     * work size dimensions.
     */
    global_size[0] = round_up(gsize[0], local_size[0]);
    global_size[1] = round_up(gsize[1], local_size[1]);

    int num_global_elements = global_size[0] * global_size[1];
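    /* Each work pool serves a fixed share of the state elements, so the
     * rounded-up global size is expected to divide evenly by WORK_POOL_SIZE. */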
    assert(num_global_elements % WORK_POOL_SIZE == 0);

    /* Calculate max groups */

    /* Denotes the maximum work groups possible w.r.t. current requested tile size. */
    unsigned int work_pool_size = (device->info.type == DEVICE_CPU) ? WORK_POOL_SIZE_CPU :
                                                                      WORK_POOL_SIZE_GPU;
    unsigned int max_work_groups = num_global_elements / work_pool_size + 1;

    /* Allocate work_pool_wgs memory. */
    work_pool_wgs.alloc_to_device(max_work_groups);
    queue_index.alloc_to_device(NUM_QUEUES);
    use_queues_flag.alloc_to_device(1);
    split_data.alloc_to_device(state_buffer_size(kgbuffer, kernel_data, num_global_elements));
    ray_state.alloc(num_global_elements);
  }
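  /* Note: these allocations persist across tiles; they are only redone after
   * load_kernels() resets kernel_data_initialized. */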

  /* Number of elements in the global state buffer */
  int num_global_elements = global_size[0] * global_size[1];

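/* Convenience wrapper: abort path_trace() if the device already has an error,
 * or if enqueueing the named kernel fails. */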
#define ENQUEUE_SPLIT_KERNEL(name, global_size, local_size) \
  if (device->have_error()) { \
    return false; \
  } \
  if (!kernel_##name->enqueue( \
          KernelDimensions(global_size, local_size), kgbuffer, kernel_data)) { \
    return false; \
  }

  tile.sample = tile.start_sample;

  /* for exponential increase between tile updates */
  int time_multiplier = 1;

  while (tile.sample < tile.start_sample + tile.num_samples) {
    /* to keep track of how long it takes to run a number of samples */
    double start_time = time_dt();

    /* initial guess to start rolling average */
    const int initial_num_samples = 1;
    /* approx number of samples per second */
    const int samples_per_second = (avg_time_per_sample > 0.0) ?
                                       int(double(time_multiplier) / avg_time_per_sample) + 1 :
                                       initial_num_samples;
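    /* With the rolling average, a batch of this size takes roughly
     * time_multiplier seconds, keeping tile updates at a steady cadence. */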

    RenderTile subtile = tile;
    subtile.start_sample = tile.sample;
    subtile.num_samples = samples_per_second;

    if (task.adaptive_sampling.use) {
      subtile.num_samples = task.adaptive_sampling.align_samples(subtile.start_sample,
                                                                 subtile.num_samples);
    }

    /* Don't go beyond requested number of samples. */
    subtile.num_samples = min(subtile.num_samples,
                              tile.start_sample + tile.num_samples - tile.sample);

    if (device->have_error()) {
      return false;
    }

    /* Reset state memory here, as the global size for the data_init kernel
     * might not be large enough to do it inside the kernel.
     */
    work_pool_wgs.zero_to_device();
    split_data.zero_to_device();
    ray_state.zero_to_device();

    if (!enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
                                        subtile,
                                        num_global_elements,
                                        kgbuffer,
                                        kernel_data,
                                        split_data,
                                        ray_state,
                                        queue_index,
                                        use_queues_flag,
                                        work_pool_wgs)) {
      return false;
    }

    ENQUEUE_SPLIT_KERNEL(path_init, global_size, local_size);

    bool activeRaysAvailable = true;
    double cancel_time = DBL_MAX;

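    /* cancel_time stays DBL_MAX until a cancel request arrives; it then
     * becomes a deadline that lets in-flight samples finish (see below). */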
    while (activeRaysAvailable) {
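      /* Heuristic batching: several rounds of path-iteration kernels are
       * enqueued between ray-state readbacks to amortize the host-side check
       * below. */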
      /* Do path-iteration in host [enqueue path-iteration kernels]. */
      for (int PathIter = 0; PathIter < 16; PathIter++) {
        ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
        if (kernel_do_volume) {
          ENQUEUE_SPLIT_KERNEL(do_volume, global_size, local_size);
        }
        ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(indirect_background, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(shader_setup, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(shader_sort, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(
            holdout_emission_blurring_pathtermination_ao, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(subsurface_scatter, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(shadow_blocked_ao, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(shadow_blocked_dl, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(enqueue_inactive, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(indirect_subsurface, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
        ENQUEUE_SPLIT_KERNEL(buffer_update, global_size, local_size);

        if (task.get_cancel() && cancel_time == DBL_MAX) {
          /* Wait up to twice as many seconds for current samples to finish
           * to avoid artifacts in render result from ending too soon.
           */
          cancel_time = time_dt() + 2.0 * time_multiplier;
        }

        if (time_dt() > cancel_time) {
          return true;
        }
      }

      /* Decide if we should exit path-iteration in host. */
      ray_state.copy_from_device(0, global_size[0] * global_size[1], 1);

      activeRaysAvailable = false;

      for (int rayStateIter = 0; rayStateIter < global_size[0] * global_size[1]; ++rayStateIter) {
        if (!IS_STATE(ray_state.data(), rayStateIter, RAY_INACTIVE)) {
          if (IS_STATE(ray_state.data(), rayStateIter, RAY_INVALID)) {
            /* Something went wrong, abort to avoid looping endlessly. */
            device->set_error("Split kernel error: invalid ray state");
            return false;
          }

          /* Not all rays are RAY_INACTIVE. */
          activeRaysAvailable = true;
          break;
        }
      }

      if (time_dt() > cancel_time) {
        return true;
      }
    }

    int filter_sample = tile.sample + subtile.num_samples - 1;
    if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
      size_t buffer_size[2];
      buffer_size[0] = round_up(tile.w, local_size[0]);
      buffer_size[1] = round_up(tile.h, local_size[1]);
      kernel_adaptive_stopping->enqueue(
          KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
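      /* Note the swapped extents below: the x-filter launches one thread per
       * row (width tile.h) and the y-filter one per column (width tile.w). */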
      buffer_size[0] = round_up(tile.h, local_size[0]);
      buffer_size[1] = round_up(1, local_size[1]);
      kernel_adaptive_filter_x->enqueue(
          KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
      buffer_size[0] = round_up(tile.w, local_size[0]);
      buffer_size[1] = round_up(1, local_size[1]);
      kernel_adaptive_filter_y->enqueue(
          KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
    }

    double time_per_sample = ((time_dt() - start_time) / subtile.num_samples);

    if (avg_time_per_sample == 0.0) {
      /* start rolling average */
      avg_time_per_sample = time_per_sample;
    }
    else {
      avg_time_per_sample = alpha * time_per_sample + (1.0 - alpha) * avg_time_per_sample;
    }

#undef ENQUEUE_SPLIT_KERNEL

    tile.sample += subtile.num_samples;
    task.update_progress(&tile, tile.w * tile.h * subtile.num_samples);

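    /* Double the batch window after every update, capped at 10 seconds, so
     * early tile updates are frequent and later ones are cheap. */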
    time_multiplier = min(time_multiplier << 1, 10);

    if (task.get_cancel()) {
      return true;
    }
  }

  if (task.adaptive_sampling.use) {
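    /* Final pass: re-run data_init over the full sample range so that
     * kernel_adaptive_adjust_samples can rescale pixels that converged early,
     * keeping the buffer consistent with the final sample count. */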
    /* Reset the start samples. */
    RenderTile subtile = tile;
    subtile.start_sample = tile.start_sample;
    subtile.num_samples = tile.sample - tile.start_sample;
    enqueue_split_kernel_data_init(KernelDimensions(global_size, local_size),
                                   subtile,
                                   num_global_elements,
                                   kgbuffer,
                                   kernel_data,
                                   split_data,
                                   ray_state,
                                   queue_index,
                                   use_queues_flag,
                                   work_pool_wgs);
    size_t buffer_size[2];
    buffer_size[0] = round_up(tile.w, local_size[0]);
    buffer_size[1] = round_up(tile.h, local_size[1]);
    kernel_adaptive_adjust_samples->enqueue(
        KernelDimensions(buffer_size, local_size), kgbuffer, kernel_data);
  }

  return true;
}

CCL_NAMESPACE_END