Blender  V2.93
device_denoising.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2011-2017 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
18 
20 
22 
24  : tile_info_mem(device, "denoising tile info mem", MEM_READ_WRITE),
25  profiler(NULL),
26  storage(device),
27  buffer(device),
28  device(device)
29 {
/* Constructor body. The signature line is absent from this listing; the index at the end of
 * the page gives it as DenoisingTask(Device *device, const DeviceTask &task).
 * Copies the denoising settings from `task`, binds the neighbor-tile map/unmap callbacks,
 * fills the TileInfo frame list and decides which stages (prefilter / filter) are enabled. */
30  radius = task.denoising.radius;
/* NOTE(review): the lerp() argument order is not visible here; presumably `strength` is the
 * interpolation factor between -5 and 3 and the result is used as a power-of-two exponent. */
31  nlm_k_2 = powf(2.0f, lerp(-5.0f, 3.0f, task.denoising.strength));
/* With relative_pca the threshold is stored negated. NOTE(review): presumably the sign lets
 * the consumer tell a relative threshold from an absolute one - confirm against the user. */
32  if (task.denoising.relative_pca) {
33  pca_threshold = -powf(10.0f, lerp(-8.0f, 0.0f, task.denoising.feature_strength));
34  }
35  else {
36  pca_threshold = powf(10.0f, lerp(-5.0f, 3.0f, task.denoising.feature_strength));
37  }
38 
/* Layout of the source render buffer, taken verbatim from the task. */
39  render_buffer.frame_stride = task.frame_stride;
40  render_buffer.pass_stride = task.pass_stride;
41  render_buffer.offset = task.pass_denoising_data;
42 
/* Layout of the target buffer. NOTE(review): listing line 45 is missing here. */
43  target_buffer.pass_stride = task.target_pass_stride;
44  target_buffer.denoising_clean_offset = task.pass_denoising_clean;
46 
/* Bind `device` as the second argument so the callbacks only need the neighbors argument. */
47  functions.map_neighbor_tiles = function_bind(task.map_neighbor_tiles, _1, device);
48  functions.unmap_neighbor_tiles = function_bind(task.unmap_neighbor_tiles, _1, device);
49 
/* tile_info_mem is listed in the page index as device_vector<int>, so the element count passed
 * to alloc() is the size of TileInfo expressed in ints. */
50  tile_info = (TileInfo *)tile_info_mem.alloc(sizeof(TileInfo) / sizeof(int));
51  tile_info->from_render = task.denoising_from_render ? 1 : 0;
52 
/* Slot 0 always holds frame 0; the remaining slots are copied from task.denoising_frames,
 * with the total count capped at DENOISE_MAX_FRAMES. */
53  tile_info->frames[0] = 0;
54  tile_info->num_frames = min(task.denoising_frames.size() + 1, DENOISE_MAX_FRAMES);
55  for (int i = 1; i < tile_info->num_frames; i++) {
56  tile_info->frames[i] = task.denoising_frames[i - 1];
57  }
58 
/* Both stages require the NLM denoiser type; prefiltered passes are only produced when
 * store_passes is set, and filtering only runs when denoising is in use. */
59  do_prefilter = task.denoising.store_passes && task.denoising.type == DENOISER_NLM;
60  do_filter = task.denoising.use && task.denoising.type == DENOISER_NLM;
61 }
62 
64 {
/* Teardown: calls free() on the XtWX, XtWY and rank members of `storage` and on `buffer.mem`.
 * NOTE(review): the signature line and the statements on listing lines 67, 70 and 71 are absent
 * from this page; presumably this is the DenoisingTask destructor and the missing lines free
 * further members - confirm against the real file. */
65  storage.XtWX.free();
66  storage.XtWY.free();
68  storage.rank.free();
69  buffer.mem.free();
72 }
73 
75 {
/* Records the layout of the neighboring tiles and of the target tile.
 * The signature line is absent from this listing; the page index gives it as
 * void set_render_buffer(RenderTileNeighbors &neighbors). */
/* Copy offset, stride and buffer pointer of every neighbor tile into tile_info. */
76  for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
77  RenderTile &rtile = neighbors.tiles[i];
78  tile_info->offsets[i] = rtile.offset;
79  tile_info->strides[i] = rtile.stride;
80  tile_info->buffers[i] = rtile.buffer;
81  }
/* Four x boundaries: the left edges of tiles 3, 4 and 5 plus the right edge of tile 5.
 * NOTE(review): presumably tiles[] is a row-major 3x3 grid with tile 4 in the center - confirm. */
82  tile_info->x[0] = neighbors.tiles[3].x;
83  tile_info->x[1] = neighbors.tiles[4].x;
84  tile_info->x[2] = neighbors.tiles[5].x;
85  tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
/* Four y boundaries: the y origins of tiles 1, 4 and 7 plus the far edge of tile 7. */
86  tile_info->y[0] = neighbors.tiles[1].y;
87  tile_info->y[1] = neighbors.tiles[4].y;
88  tile_info->y[2] = neighbors.tiles[7].y;
89  tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
90 
91  target_buffer.offset = neighbors.target.offset;
92  target_buffer.stride = neighbors.target.stride;
93  target_buffer.ptr = neighbors.target.buffer;
94 
/* NOTE(review): listing lines 96, 100 and 103 are missing, so the assignment targets of this
 * branch are not visible. The surviving right-hand side queries the prefiltered-pass offset
 * from the target's RenderBuffers params, and only when prefiltering is enabled and the
 * target has a `buffers` pointer. */
95  if (do_prefilter && neighbors.target.buffers) {
97  neighbors.target.buffers->params.get_denoising_prefiltered_offset();
98  }
99  else {
101  }
102 
104 }
105 
107 {
/* Computes the dimensions of the denoising buffer and allocates it.
 * NOTE(review): the signature line and listing lines 110, 111, 115, 118, 122, 126, 130 and 140
 * are absent from this page; the page index lists void setup_denoising_buffer(), which is
 * presumably this function. The `if` that opens the branch closed on line 135 is among the
 * missing lines. */
108  /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring
109  * tiles */
/* The clip rectangle is (x[0], y[0], x[3], y[3]), i.e. min corner in .x/.y, max in .z/.w. */
112  rect = rect_clip(rect,
113  make_int4(tile_info->x[0], tile_info->y[0], tile_info->x[3], tile_info->y[3]));
114 
/* One extra pass is reserved when the intensity pass is in use. */
116  buffer.passes = buffer.use_intensity ? 15 : 14;
117  buffer.width = rect.z - rect.x;
119  buffer.h = rect.w - rect.y;
/* Express the device's sub-pointer alignment as a number of floats, rounding up, so that each
 * pass can start at an aligned offset. NOTE(review): assumes the alignment is in bytes. */
120  int alignment_floats = divide_up(device->mem_sub_ptr_alignment(), sizeof(float));
121  buffer.pass_stride = align_up(buffer.stride * buffer.h, alignment_floats);
123  /* Pad the total size by four floats since the SIMD kernels might go a bit over the end. */
124  int mem_size = align_up(tile_info->num_frames * buffer.frame_stride + 4, alignment_floats);
125  buffer.mem.alloc_to_device(mem_size, false);
127 
128  /* CPUs process shifts sequentially while GPUs process them in parallel. */
129  int num_layers;
131  /* Shadowing prefiltering uses a radius of 6, so allocate at least that much. */
132  int max_radius = max(radius, 6);
/* A square window of side (2 * max_radius + 1) gives the number of shifts. */
133  int num_shifts = (2 * max_radius + 1) * (2 * max_radius + 1);
134  num_layers = 2 * num_shifts + 1;
135  }
136  else {
137  num_layers = 3;
138  }
139  /* Allocate two layers per shift as well as one for the weight accumulation. */
141 }
142 
144 {
/* Builds the shadow feature from two half images through a chain of non-local-means passes.
 * NOTE(review): the signature line and listing lines 148-152 and 182 are absent from this page,
 * so the declarations of unfiltered_b, sample_var, sample_var_var, buffer_var, filtered_var and
 * shadow_pass are not visible; presumably this is DenoisingTask::prefilter_shadowing - confirm.
 * The set_parameters() arguments are (r, f, a, k_2, is_color) per the page index. */
145  device_ptr null_ptr = (device_ptr)0;
146 
/* First pass-sized slice of the denoising buffer. */
147  device_sub_ptr unfiltered_a(buffer.mem, 0, buffer.pass_stride);
153 
154  /* Get the A/B unfiltered passes, the combined sample variance, the estimated variance of the
155  * sample variance and the buffer variance. */
156  functions.divide_shadow(*unfiltered_a, *unfiltered_b, *sample_var, *sample_var_var, *buffer_var);
157 
158  /* Smooth the (generally pretty noisy) buffer variance using the spatial information from the
159  * sample variance. */
160  nlm_state.set_parameters(6, 3, 4.0f, 1.0f, false);
161  functions.non_local_means(*buffer_var, *sample_var, *sample_var_var, *filtered_var);
162 
163  /* Reuse memory, the previous data isn't needed anymore. */
164  device_ptr filtered_a = *buffer_var, filtered_b = *sample_var;
165  /* Use the smoothed variance to filter the two shadow half images using each other for weight
166  * calculation. */
167  nlm_state.set_parameters(5, 3, 1.0f, 0.25f, false);
168  functions.non_local_means(*unfiltered_a, *unfiltered_b, *filtered_var, filtered_a);
169  functions.non_local_means(*unfiltered_b, *unfiltered_a, *filtered_var, filtered_b);
170 
/* The sample_var_var slice is reused as the output for the residual variance. */
171  device_ptr residual_var = *sample_var_var;
172  /* Estimate the residual variance between the two filtered halves. */
/* combine_halves takes (a, b, mean, variance, r, rect): here the mean output is null and only
 * the variance output is requested, with r = 2. */
173  functions.combine_halves(filtered_a, filtered_b, null_ptr, residual_var, 2, rect);
174 
/* The slices that held the unfiltered halves are reused for the final results. */
175  device_ptr final_a = *unfiltered_a, final_b = *unfiltered_b;
176  /* Use the residual variance for a second filter pass. */
177  nlm_state.set_parameters(4, 2, 1.0f, 0.5f, false);
178  functions.non_local_means(filtered_a, filtered_b, residual_var, final_a);
179  functions.non_local_means(filtered_b, filtered_a, residual_var, final_b);
180 
181  /* Combine the two double-filtered halves to a final shadow feature. */
/* Here only the mean output is requested (variance output is null) and r = 0. */
183  functions.combine_halves(final_a, final_b, *shadow_pass, null_ptr, 0, rect);
184 }
185 
187 {
/* Reads seven feature passes together with their variances, smooths each one with non-local
 * means and stores the result in the denoising buffer.
 * NOTE(review): the signature line and listing lines 188-189 are absent from this page, so the
 * declarations of `unfiltered` and `variance` are not visible; presumably this is
 * DenoisingTask::prefilter_features - confirm. */
190 
/* Parallel tables, indexed by loop iteration: source offset of the mean, source offset of the
 * variance, and destination pass slot inside buffer.mem. */
191  int mean_from[] = {0, 1, 2, 12, 6, 7, 8};
192  int variance_from[] = {3, 4, 5, 13, 9, 10, 11};
193  int pass_to[] = {1, 2, 3, 0, 5, 6, 7};
194  for (int pass = 0; pass < 7; pass++) {
195  device_sub_ptr feature_pass(
196  buffer.mem, pass_to[pass] * buffer.pass_stride, buffer.pass_stride);
197  /* Get the unfiltered pass and its variance from the RenderBuffers. */
/* The last argument is the scale, here the reciprocal of the sample count. */
198  functions.get_feature(mean_from[pass],
199  variance_from[pass],
200  *unfiltered,
201  *variance,
202  1.0f / render_buffer.samples);
203  /* Smooth the pass and store the result in the denoising buffers. */
/* The unfiltered pass serves as both the image and the guide of the filter. */
204  nlm_state.set_parameters(2, 2, 1.0f, 0.25f, false);
205  functions.non_local_means(*unfiltered, *unfiltered, *variance, *feature_pass);
206  }
207 }
208 
210 {
/* Loads the three color passes and their variances into a temporary buffer, post-processes
 * them into the denoising buffer and optionally derives a filtered intensity pass.
 * NOTE(review): the signature line and listing lines 235 and 239 are absent from this page;
 * presumably this is DenoisingTask::prefilter_color, which run_denoising calls - confirm. */
/* Source offsets of the color means/variances and their destination slots in buffer.mem. */
211  int mean_from[] = {20, 21, 22};
212  int variance_from[] = {23, 24, 25};
213  int mean_to[] = {8, 9, 10};
214  int variance_to[] = {11, 12, 13};
215  int num_color_passes = 3;
216 
/* Six pass-sized slices: slices 0-2 hold the color means, slices 3-5 hold their variances. */
217  device_only_memory<float> temporary_color(device, "denoising temporary color");
218  temporary_color.alloc_to_device(6 * buffer.pass_stride, false);
219 
220  for (int pass = 0; pass < num_color_passes; pass++) {
221  device_sub_ptr color_pass(temporary_color, pass * buffer.pass_stride, buffer.pass_stride);
222  device_sub_ptr color_var_pass(
223  temporary_color, (pass + 3) * buffer.pass_stride, buffer.pass_stride);
224  functions.get_feature(mean_from[pass],
225  variance_from[pass],
226  *color_pass,
227  *color_var_pass,
228  1.0f / render_buffer.samples);
229  }
230 
/* Slice 0 of buffer.mem is used as the depth input; the variance and output views each span
 * three consecutive pass slices, starting at variance_to[0] and mean_to[0] respectively. */
231  device_sub_ptr depth_pass(buffer.mem, 0, buffer.pass_stride);
232  device_sub_ptr color_var_pass(
233  buffer.mem, variance_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
234  device_sub_ptr output_pass(buffer.mem, mean_to[0] * buffer.pass_stride, 3 * buffer.pass_stride);
/* NOTE(review): the callee on listing line 235 is missing. The argument order (image,
 * variance, depth, output) matches the detect_outliers entry in the page index, so this is
 * presumably functions.detect_outliers(...) - confirm. */
236  temporary_color.device_pointer, *color_var_pass, *depth_pass, *output_pass);
237 
238  if (buffer.use_intensity) {
/* NOTE(review): the declaration of intensity_pass (listing line 239) is missing. The color
 * output is filtered with itself as guide, using the task radius and is_color = true. */
240  nlm_state.set_parameters(radius, 4, 2.0f, nlm_k_2 * 4.0f, true);
241  functions.non_local_means(*output_pass, *output_pass, *color_var_pass, *intensity_pass);
242  }
243 }
244 
246 {
/* Fills the denoising buffer pass by pass for every frame, as the alternative to prefiltering.
 * NOTE(review): the signature line and listing lines 255, 257 and 260 are absent from this
 * page; presumably this is DenoisingTask::load_buffer, which run_denoising calls when
 * tile_info->from_render is zero - confirm. */
247  device_ptr null_ptr = (device_ptr)0;
248 
/* Saved so it can be restored after the frame loop. NOTE(review): the statement that changes
 * render_buffer.offset inside the loop is presumably on missing line 260. */
249  int original_offset = render_buffer.offset;
250 
251  int num_passes = buffer.use_intensity ? 15 : 14;
252  for (int i = 0; i < tile_info->num_frames; i++) {
253  for (int pass = 0; pass < num_passes; pass++) {
254  device_sub_ptr to_pass(
/* Passes 11..13 are treated as variances and are scaled by 1 / samples; all others use 1.0. */
256  bool is_variance = (pass >= 11) && (pass <= 13);
/* NOTE(review): the callee on line 257 is missing; the arguments match get_feature's
 * (mean_offset, variance_offset, mean_ptr, variance_ptr, scale) with no variance requested. */
258  pass, -1, *to_pass, null_ptr, is_variance ? (1.0f / render_buffer.samples) : 1.0f);
259  }
261  }
262 
263  render_buffer.offset = original_offset;
264 }
265 
267 {
/* Writes every pass of the denoising buffer to the target buffer.
 * NOTE(review): the signature line and listing lines 268-271 and 274 are absent from this page,
 * so the declaration of from_pass is not visible; presumably this is
 * DenoisingTask::write_buffer, which run_denoising calls when do_prefilter is set - confirm. */
272  int num_passes = buffer.use_intensity ? 15 : 14;
273  for (int pass = 0; pass < num_passes; pass++) {
/* Destination slot is the pass index shifted by the target's denoising output offset. */
275  int out_offset = pass + target_buffer.denoising_output_offset;
276  functions.write_feature(out_offset, *from_pass, target_buffer.ptr);
277  }
278 }
279 
281 {
284 
287 
289 }
290 
292 {
/* Accumulates the reconstruction inputs frame by frame.
 * NOTE(review): most of this function is absent from the listing (lines 293-296, 298-299,
 * 301-306, 308 and 321), including the signature and the declaration of color_ptr; presumably
 * this is DenoisingTask::reconstruct, which run_denoising calls when do_filter is set. */
297 
/* Linear offset of the filter area's origin within the target buffer, in pixels of stride. */
300  int tile_coordinate_offset = filter_area.y * target_buffer.stride + filter_area.x;
307 
/* View over three consecutive pass slices starting at slot 11 (the color variances). */
309  device_sub_ptr color_var_ptr(buffer.mem, 11 * buffer.pass_stride, 3 * buffer.pass_stride);
310  for (int f = 0; f < tile_info->num_frames; f++) {
/* A scale pointer is only supplied for frames whose index entry is non-zero when more than
 * one frame is present; otherwise 0 is passed. */
311  device_ptr scale_ptr = 0;
312  device_sub_ptr *scale_sub_ptr = NULL;
313  if (tile_info->frames[f] != 0 && (tile_info->num_frames > 1)) {
/* Heap-allocated so the sub-pointer outlives this branch and stays valid during accumulate. */
314  scale_sub_ptr = new device_sub_ptr(buffer.mem, 14 * buffer.pass_stride, buffer.pass_stride);
315  scale_ptr = **scale_sub_ptr;
316  }
317 
318  functions.accumulate(*color_ptr, *color_var_ptr, scale_ptr, f);
/* Deleting a null pointer is a no-op, so this is safe when no sub-pointer was created. */
319  delete scale_sub_ptr;
320  }
322 }
323 
325 {
/* Top-level driver for one tile: maps the neighbor tiles, prepares the buffers, runs the
 * enabled stages and unmaps the tiles again.
 * The signature line is absent from this listing; the page index gives it as
 * void run_denoising(RenderTile &tile). NOTE(review): listing lines 330, 333, 334 and 342 are
 * also missing, so some calls made here are not visible. */
326  RenderTileNeighbors neighbors(tile);
327  functions.map_neighbor_tiles(neighbors);
328  set_render_buffer(neighbors);
329 
331 
/* Data coming straight from the render is prefiltered; otherwise the passes are loaded. */
332  if (tile_info->from_render) {
335  prefilter_color();
336  }
337  else {
338  load_buffer();
339  }
340 
341  if (do_filter) {
343  reconstruct();
344  }
345 
/* The prefiltered passes are written out only when prefiltering was requested. */
346  if (do_prefilter) {
347  write_buffer();
348  }
349 
350  functions.unmap_neighbor_tiles(neighbors);
351 }
352 
void set_render_buffer(RenderTileNeighbors &neighbors)
struct DenoisingTask::Storage storage
void setup_denoising_buffer()
void run_denoising(RenderTile &tile)
struct DenoisingTask::NLMState nlm_state
struct DenoisingTask::RenderBuffers render_buffer
struct DenoisingTask::TargetBuffer target_buffer
device_vector< int > tile_info_mem
struct DenoisingTask::DeviceFunctions functions
struct DenoisingTask::DenoiseBuffers buffer
DenoisingTask(Device *device, const DeviceTask &task)
struct DenoisingTask::ReconstructionState reconstruction_state
TileInfo * tile_info
Definition: device.h:293
virtual int mem_sub_ptr_alignment()
Definition: device.h:365
static const int SIZE
Definition: buffers.h:173
RenderTile target
Definition: buffers.h:177
RenderTile tiles[SIZE]
Definition: buffers.h:176
int stride
Definition: buffers.h:143
RenderBuffers * buffers
Definition: buffers.h:152
device_ptr buffer
Definition: buffers.h:146
int offset
Definition: buffers.h:142
device_ptr device_pointer
void alloc_to_device(size_t num, bool shrink_to_fit=true)
T * alloc(size_t width, size_t height=0, size_t depth=0)
void copy_to_device()
int x
Definition: btConvexHull.h:149
int w
Definition: btConvexHull.h:149
int y
Definition: btConvexHull.h:149
int z
Definition: btConvexHull.h:149
#define function_bind
@ MEM_READ_WRITE
Definition: device_memory.h:37
@ DENOISER_NLM
Definition: device_task.h:36
#define DENOISE_MAX_FRAMES
#define XTWY_SIZE
#define TRANSFORM_SIZE
#define XTWX_SIZE
#define powf(x, y)
#define CCL_NAMESPACE_END
#define make_int4(x, y, z, w)
__kernel void ccl_constant KernelData ccl_global void ccl_global char ccl_global int ccl_global char ccl_global unsigned int ccl_global float * buffer
static float lerp(float t, float a, float b)
struct blender::compositor::@172::@174 task
#define min(a, b)
Definition: sort.c:51
device_only_memory< float > mem
device_only_memory< float > temporary_mem
function< bool(device_ptr a_ptr, device_ptr b_ptr, device_ptr mean_ptr, device_ptr variance_ptr, int r, int4 rect)> combine_halves
function< bool(device_ptr image_ptr, device_ptr variance_ptr, device_ptr depth_ptr, device_ptr output_ptr)> detect_outliers
function< void(RenderTileNeighbors &neighbors)> map_neighbor_tiles
function< bool(int out_offset, device_ptr frop_ptr, device_ptr buffer_ptr)> write_feature
function< bool(device_ptr output_ptr)> solve
function< bool(device_ptr a_ptr, device_ptr b_ptr, device_ptr sample_variance_ptr, device_ptr sv_variance_ptr, device_ptr buffer_variance_ptr)> divide_shadow
function< void(RenderTileNeighbors &neighbors)> unmap_neighbor_tiles
function< bool()> construct_transform
function< bool(int mean_offset, int variance_offset, device_ptr mean_ptr, device_ptr variance_ptr, float scale)> get_feature
function< bool(device_ptr color_ptr, device_ptr color_variance_ptr, device_ptr scale_ptr, int frame)> accumulate
function< bool(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr)> non_local_means
void set_parameters(int r_, int f_, float a_, float k_2_, bool is_color_)
device_only_memory< float > XtWX
device_only_memory< float3 > XtWY
device_only_memory< float > transform
device_only_memory< int > rank
int offsets[9]
int from_render
int num_frames
int y[4]
int strides[9]
int x[4]
long long int buffers[9]
int frames[DENOISE_MAX_FRAMES]
float max
CCL_NAMESPACE_BEGIN ccl_device_inline int4 rect_from_shape(int x0, int y0, int w, int h)
Definition: util_rect.h:27
ccl_device_inline int4 rect_clip(int4 a, int4 b)
Definition: util_rect.h:38
ccl_device_inline int4 rect_expand(int4 rect, int d)
Definition: util_rect.h:32
ccl_device_inline size_t align_up(size_t offset, size_t alignment)
Definition: util_types.h:65
ccl_device_inline size_t divide_up(size_t x, size_t y)
Definition: util_types.h:70
uint64_t device_ptr
Definition: util_types.h:62