/* Blender v2.93 — Cycles OptiX device implementation (intern/cycles/device/device_optix.cpp). */
1 /*
2  * Copyright 2019, NVIDIA Corporation.
3  * Copyright 2019, Blender Foundation.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 #ifdef WITH_OPTIX
19 
20 # include "bvh/bvh.h"
21 # include "bvh/bvh_optix.h"
22 # include "device/cuda/device_cuda.h"
23 # include "device/device_denoising.h"
24 # include "device/device_intern.h"
25 # include "render/buffers.h"
26 # include "render/hair.h"
27 # include "render/mesh.h"
28 # include "render/object.h"
29 # include "render/scene.h"
30 # include "util/util_debug.h"
31 # include "util/util_logging.h"
32 # include "util/util_md5.h"
33 # include "util/util_path.h"
34 # include "util/util_progress.h"
35 # include "util/util_time.h"
36 
37 # ifdef WITH_CUDA_DYNLOAD
38 # include <cuew.h>
39 // Do not use CUDA SDK headers when using CUEW
40 # define OPTIX_DONT_INCLUDE_CUDA
41 # endif
42 # include <optix_function_table_definition.h>
43 # include <optix_stubs.h>
44 
45 // TODO(pmours): Disable this once drivers have native support
46 # define OPTIX_DENOISER_NO_PIXEL_STRIDE 1
47 
49 
/* Parameters for shader evaluation launches (bake/displace/background).
 * Make sure this stays in sync with kernel_globals.h — the struct is copied
 * to the device verbatim, so field order and types must match exactly. */
struct ShaderParams {
  uint4 *input;    // Device pointer to the shader evaluation input array
  float4 *output;  // Device pointer receiving the evaluation results
  int type;        // Evaluation type — NOTE(review): presumably a ShaderEvalType value; confirm in kernel_globals.h
  int filter;      // Filter flags — NOTE(review): semantics defined on the kernel side; confirm
  int sx;          // Start offset into the input/output arrays
  int offset;      // Additional offset — NOTE(review): meaning defined on the kernel side; confirm
  int sample;      // Current sample number
};
/* Global launch parameters uploaded once per launch (named "__params" on the
 * device, see pipelineLaunchParamsVariableName in load_kernels). KERNEL_TEX
 * expands to one device pointer per kernel texture listed in
 * kernel/kernel_textures.h. Layout must match the device-side struct. */
struct KernelParams {
  WorkTile tile;        // Work tile for path tracing launches (written per batch in launch_render)
  ShaderParams shader;  // Parameters for shader evaluation launches
#  define KERNEL_TEX(type, name) const type *name;
#  include "kernel/kernel_textures.h"
#  undef KERNEL_TEX
};
68 
/* Check a CUDA driver API call and report a readable error.
 * On failure this sets the device error string (with the failing statement and
 * source line) and returns from the calling function, so these macros may only
 * be used inside member functions of this device class.
 * Wrapped in do/while(0) instead of `{ ... } (void)0` so the macro expands to a
 * single statement and is safe inside unbraced if/else chains. */
#  define check_result_cuda(stmt) \
    do { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return; \
      } \
    } while (0)
/* Same as check_result_cuda, but for use in functions returning a bool. */
#  define check_result_cuda_ret(stmt) \
    do { \
      CUresult res = stmt; \
      if (res != CUDA_SUCCESS) { \
        const char *name; \
        cuGetErrorName(res, &name); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return false; \
      } \
    } while (0)
91 
/* Check an OptiX API call and report a readable error.
 * On failure this sets the device error string and returns from the calling
 * function. Wrapped in do/while(0) instead of `{ ... } (void)0` so the macro
 * expands to a single statement and is safe inside unbraced if/else chains. */
#  define check_result_optix(stmt) \
    do { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return; \
      } \
    } while (0)
/* Same as check_result_optix, but for use in functions returning a bool. */
#  define check_result_optix_ret(stmt) \
    do { \
      enum OptixResult res = stmt; \
      if (res != OPTIX_SUCCESS) { \
        const char *name = optixGetErrorName(res); \
        set_error(string_printf("%s in %s (device_optix.cpp:%d)", name, #stmt, __LINE__)); \
        return false; \
      } \
    } while (0)
112 
/* Look up a denoising filter kernel in the CUDA filter module and launch it
 * over a (w x h) grid. The square block size is derived from the kernel's
 * maximum threads per block; grid dimensions use ceiling division so the full
 * image is covered. Launches on the default stream (stream argument is 0).
 * Uses check_result_cuda_ret, so this may only appear in functions returning
 * bool. Wrapped in do/while(0) so it expands to a single statement and is safe
 * inside unbraced if/else chains. */
#  define launch_filter_kernel(func_name, w, h, args) \
    do { \
      CUfunction func; \
      check_result_cuda_ret(cuModuleGetFunction(&func, cuFilterModule, func_name)); \
      check_result_cuda_ret(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1)); \
      int threads; \
      check_result_cuda_ret( \
          cuFuncGetAttribute(&threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
      threads = (int)sqrt((float)threads); \
      int xblocks = ((w) + threads - 1) / threads; \
      int yblocks = ((h) + threads - 1) / threads; \
      check_result_cuda_ret( \
          cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0)); \
    } while (0)
128 
129 class OptiXDevice : public CUDADevice {
130 
  // List of OptiX program groups.
  // These values double as indices into the shader binding table (sbt_data is
  // allocated with NUM_PROGRAM_GROUPS entries and addressed as
  // PG_* * sizeof(SbtRecord) in launch_render), so the order here is
  // significant and must match the SBT setup in load_kernels().
  enum {
    PG_RGEN,
    PG_MISS,
    PG_HITD,  // Default hit group
    PG_HITS,  // __SHADOW_RECORD_ALL__ hit group
    PG_HITL,  // __BVH_LOCAL__ hit group (only used for triangles)
#  if OPTIX_ABI_VERSION >= 36
    PG_HITD_MOTION,
    PG_HITS_MOTION,
#  endif
    PG_BAKE,  // kernel_bake_evaluate
    PG_DISP,  // kernel_displace_evaluate
    PG_BACK,  // kernel_background_evaluate
    PG_CALL,  // First of three consecutive direct-callable groups (see load_kernels)
    NUM_PROGRAM_GROUPS = PG_CALL + 3
  };

  // List of OptiX pipelines: one for path tracing, one for shader evaluation
  // (baking, displacement and background light evaluation)
  enum { PIP_PATH_TRACE, PIP_SHADER_EVAL, NUM_PIPELINES };
151 
  // A single shader binding table entry. Only the mandatory header is stored,
  // since the SBT is used purely to select between programs (no per-record data).
  struct SbtRecord {
    char header[OPTIX_SBT_RECORD_HEADER_SIZE];
  };

  // Information stored about CUDA memory allocations
  struct CUDAMem {
    bool free_map_host = false;   // Whether the mapped host pointer must be freed with the allocation
    CUarray array = NULL;         // CUDA array backing a texture allocation, if any
    CUtexObject texobject = 0;    // Texture object created for this allocation, if any
    bool use_mapped_host = false; // Whether host memory is mapped into the device address space
  };
164 
  // Helper class to manage the current CUDA context (RAII: pushes the context
  // on construction, pops it again when the scope ends).
  struct CUDAContextScope {
    CUDAContextScope(CUcontext ctx)
    {
      // NOTE(review): the CUresult of the push is intentionally ignored here —
      // errors surface on the first checked driver call inside the scope
      cuCtxPushCurrent(ctx);
    }
    ~CUDAContextScope()
    {
      cuCtxPopCurrent(NULL);
    }
  };
176 
177  // Use a pool with multiple threads to support launches with multiple CUDA streams
179 
180  vector<CUstream> cuda_stream;
181  OptixDeviceContext context = NULL;
182 
183  OptixModule optix_module = NULL; // All necessary OptiX kernels are in one module
184  OptixModule builtin_modules[2] = {};
185  OptixPipeline pipelines[NUM_PIPELINES] = {};
186 
187  bool motion_blur = false;
188  device_vector<SbtRecord> sbt_data;
189  device_only_memory<KernelParams> launch_params;
190  OptixTraversableHandle tlas_handle = 0;
191 
192  OptixDenoiser denoiser = NULL;
193  device_only_memory<unsigned char> denoiser_state;
194  int denoiser_input_passes = 0;
195 
196  vector<device_only_memory<char>> delayed_free_bvh_memory;
197  thread_mutex delayed_free_bvh_mutex;
198 
 public:
  // Create the OptiX device on top of an existing CUDA device: sets the number
  // of launch streams, creates the OptiX device context (with optional logging
  // and validation) and allocates per-stream launch parameter memory.
  // On any failure the error string is set and construction aborts early.
  OptiXDevice(DeviceInfo &info_, Stats &stats_, Profiler &profiler_, bool background_)
      : CUDADevice(info_, stats_, profiler_, background_),
        sbt_data(this, "__sbt", MEM_READ_ONLY),
        launch_params(this, "__params", false),
        denoiser_state(this, "__denoiser_state", true)
  {
    // Store number of CUDA streams in device info
    info.cpu_threads = DebugFlags().optix.cuda_streams;

    // Make the CUDA context current
    if (!cuContext) {
      return;  // Do not initialize if CUDA context creation failed already
    }
    const CUDAContextScope scope(cuContext);

    // Create OptiX context for this device
    OptixDeviceContextOptions options = {};
#  ifdef WITH_CYCLES_LOGGING
    options.logCallbackLevel = 4;  // Fatal = 1, Error = 2, Warning = 3, Print = 4
    options.logCallbackFunction =
        [](unsigned int level, const char *, const char *message, void *) {
          // Map OptiX log levels onto the corresponding glog severities
          switch (level) {
            case 1:
              LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
              break;
            case 2:
              LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
              break;
            case 3:
              LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
              break;
            case 4:
              LOG_IF(INFO, VLOG_IS_ON(1)) << message;
              break;
          }
        };
#  endif
#  if OPTIX_ABI_VERSION >= 41 && defined(WITH_CYCLES_DEBUG)
    options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
#  endif
    check_result_optix(optixDeviceContextCreate(cuContext, &options, &context));
#  ifdef WITH_CYCLES_LOGGING
    check_result_optix(optixDeviceContextSetLogCallback(
        context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
#  endif

    // Create launch streams, one per worker thread
    cuda_stream.resize(info.cpu_threads);
    for (int i = 0; i < info.cpu_threads; ++i)
      check_result_cuda(cuStreamCreate(&cuda_stream[i], CU_STREAM_NON_BLOCKING));

    // Fix weird compiler bug that assigns wrong size
    launch_params.data_elements = sizeof(KernelParams);
    // Allocate launch parameter buffer memory on device (one entry per stream,
    // so concurrent launches do not overwrite each other's parameters)
    launch_params.alloc_to_device(info.cpu_threads);
  }
  // Tear down the device: cancel outstanding tasks, release all device memory,
  // unload modules/pipelines and destroy streams, denoiser and the OptiX
  // context (the context is destroyed last, after everything that uses it).
  ~OptiXDevice()
  {
    // Stop processing any more tasks
    task_pool.cancel();

    // Make CUDA context current
    const CUDAContextScope scope(cuContext);

    // Release BVH memory whose freeing was deferred while rendering
    free_bvh_memory_delayed();

    sbt_data.free();
    texture_info.free();
    launch_params.free();
    denoiser_state.free();

    // Unload modules
    if (optix_module != NULL)
      optixModuleDestroy(optix_module);
    for (unsigned int i = 0; i < 2; ++i)
      if (builtin_modules[i] != NULL)
        optixModuleDestroy(builtin_modules[i]);
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i)
      if (pipelines[i] != NULL)
        optixPipelineDestroy(pipelines[i]);

    // Destroy launch streams
    for (CUstream stream : cuda_stream)
      cuStreamDestroy(stream);

    if (denoiser != NULL)
      optixDenoiserDestroy(denoiser);

    optixDeviceContextDestroy(context);
  }
290 
291  private:
292  bool show_samples() const override
293  {
294  // Only show samples if not rendering multiple tiles in parallel
295  return info.cpu_threads == 1;
296  }
297 
298  BVHLayoutMask get_bvh_layout_mask() const override
299  {
300  // CUDA kernels are used when doing baking, so need to build a BVH those can understand too!
301  if (optix_module == NULL)
302  return CUDADevice::get_bvh_layout_mask();
303 
304  // OptiX has its own internal acceleration structure format
305  return BVH_LAYOUT_OPTIX;
306  }
307 
308  string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
309  bool filter,
310  bool /*split*/) override
311  {
312  // Split kernel is not supported in OptiX
313  string common_cflags = CUDADevice::compile_kernel_get_common_cflags(
314  requested_features, filter, false);
315 
316  // Add OptiX SDK include directory to include paths
317  const char *optix_sdk_path = getenv("OPTIX_ROOT_DIR");
318  if (optix_sdk_path) {
319  common_cflags += string_printf(" -I\"%s/include\"", optix_sdk_path);
320  }
321 
322  // Specialization for shader raytracing
323  if (requested_features.use_shader_raytrace) {
324  common_cflags += " --keep-device-functions";
325  }
326  else {
327  common_cflags += " -D __NO_SHADER_RAYTRACE__";
328  }
329 
330  return common_cflags;
331  }
332 
  // Load (or compile on demand) the OptiX PTX module, then build all program
  // groups and the path tracing / shader evaluation pipelines for the
  // requested feature set. Any previously loaded module/pipelines are unloaded
  // first, so this is safe to call repeatedly when features change.
  // Returns false (with the device error string set) on failure.
  bool load_kernels(const DeviceRequestedFeatures &requested_features) override
  {
    if (have_error()) {
      // Abort early if context creation failed already
      return false;
    }

    // Load CUDA modules because we need some of the utility kernels
    if (!CUDADevice::load_kernels(requested_features)) {
      return false;
    }

    // Baking is currently performed using CUDA, so no need to load OptiX kernels
    if (requested_features.use_baking) {
      return true;
    }

    const CUDAContextScope scope(cuContext);

    // Unload existing OptiX module and pipelines first
    if (optix_module != NULL) {
      optixModuleDestroy(optix_module);
      optix_module = NULL;
    }
    for (unsigned int i = 0; i < 2; ++i) {
      if (builtin_modules[i] != NULL) {
        optixModuleDestroy(builtin_modules[i]);
        builtin_modules[i] = NULL;
      }
    }
    for (unsigned int i = 0; i < NUM_PIPELINES; ++i) {
      if (pipelines[i] != NULL) {
        optixPipelineDestroy(pipelines[i]);
        pipelines[i] = NULL;
      }
    }

    OptixModuleCompileOptions module_options = {};
    module_options.maxRegisterCount = 0;  // Do not set an explicit register limit
#  ifdef WITH_CYCLES_DEBUG
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
    module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif

#  if OPTIX_ABI_VERSION >= 41
    module_options.boundValues = nullptr;
    module_options.numBoundValues = 0;
#  endif

    OptixPipelineCompileOptions pipeline_options = {};
    // Default to no motion blur and two-level graph, since it is the fastest option
    pipeline_options.usesMotionBlur = false;
    pipeline_options.traversableGraphFlags =
        OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
    pipeline_options.numPayloadValues = 6;
    pipeline_options.numAttributeValues = 2;  // u, v
    pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
    pipeline_options.pipelineLaunchParamsVariableName = "__params";  // See kernel_globals.h

#  if OPTIX_ABI_VERSION >= 36
    pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
    if (requested_features.use_hair) {
      // Built-in B-spline curves need the native primitive type; custom
      // intersection programs use the custom primitive type instead
      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
      }
      else {
        pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
      }
    }
#  endif

    // Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
    // This is necessary since objects may be reported to have motion if the Vector pass is
    // active, but may still need to be rendered without motion blur if that isn't active as well
    motion_blur = requested_features.use_object_motion;

    if (motion_blur) {
      pipeline_options.usesMotionBlur = true;
      // Motion blur can insert motion transforms into the traversal graph
      // It is no longer a two-level graph then, so need to set flags to allow any configuration
      pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
    }

    {  // Load and compile PTX module with OptiX kernels
      string ptx_data, ptx_filename = path_get(requested_features.use_shader_raytrace ?
                                                   "lib/kernel_optix_shader_raytrace.ptx" :
                                                   "lib/kernel_optix.ptx");
      if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
        if (!getenv("OPTIX_ROOT_DIR")) {
          set_error(
              "Missing OPTIX_ROOT_DIR environment variable (which must be set with the path to "
              "the Optix SDK to be able to compile Optix kernels on demand).");
          return false;
        }
        ptx_filename = compile_kernel(requested_features, "kernel_optix", "optix", true);
      }
      if (ptx_filename.empty() || !path_read_text(ptx_filename, ptx_data)) {
        set_error("Failed to load OptiX kernel from '" + ptx_filename + "'");
        return false;
      }

      check_result_optix_ret(optixModuleCreateFromPTX(context,
                                                      &module_options,
                                                      &pipeline_options,
                                                      ptx_data.data(),
                                                      ptx_data.size(),
                                                      nullptr,
                                                      0,
                                                      &optix_module));
    }

    // Create program groups. Descriptors for unused features keep their
    // zero-initialized defaults.
    OptixProgramGroup groups[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
    OptixProgramGroupOptions group_options = {};  // There are no options currently
    group_descs[PG_RGEN].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
    group_descs[PG_RGEN].raygen.module = optix_module;
    // Ignore branched integrator for now (see "requested_features.use_integrator_branched")
    group_descs[PG_RGEN].raygen.entryFunctionName = "__raygen__kernel_optix_path_trace";
    group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
    group_descs[PG_MISS].miss.module = optix_module;
    group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
    group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
    group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
    group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
    group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
    group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";

    if (requested_features.use_hair) {
      group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
      group_descs[PG_HITS].hitgroup.moduleIS = optix_module;

      // Add curve intersection programs
      if (requested_features.use_hair_thick) {
        // Slower programs for thick hair since that also slows down ribbons.
        // Ideally this should not be needed.
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_all";
      }
      else {
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
      }

#  if OPTIX_ABI_VERSION >= 36
      if (DebugFlags().optix.curves_api && requested_features.use_hair_thick) {
        // Replace the custom intersection programs with OptiX's built-in
        // cubic B-spline intersection module
        OptixBuiltinISOptions builtin_options = {};
        builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
        builtin_options.usesMotionBlur = false;

        check_result_optix_ret(optixBuiltinISModuleGet(
            context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));

        group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
        group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
        group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
        group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;

        if (motion_blur) {
          // A second built-in module variant is needed for motion blur
          builtin_options.usesMotionBlur = true;

          check_result_optix_ret(optixBuiltinISModuleGet(
              context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));

          group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
          group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
          group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
          group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
        }
      }
#  endif
    }

    if (requested_features.use_subsurface || requested_features.use_shader_raytrace) {
      // Add hit group for local intersections
      group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
      group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
      group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
    }

    if (requested_features.use_baking) {
      group_descs[PG_BAKE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BAKE].raygen.module = optix_module;
      group_descs[PG_BAKE].raygen.entryFunctionName = "__raygen__kernel_optix_bake";
    }

    if (requested_features.use_true_displacement) {
      group_descs[PG_DISP].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_DISP].raygen.module = optix_module;
      group_descs[PG_DISP].raygen.entryFunctionName = "__raygen__kernel_optix_displace";
    }

    if (requested_features.use_background_light) {
      group_descs[PG_BACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
      group_descs[PG_BACK].raygen.module = optix_module;
      group_descs[PG_BACK].raygen.entryFunctionName = "__raygen__kernel_optix_background";
    }

    // Shader raytracing replaces some functions with direct callables
    if (requested_features.use_shader_raytrace) {
      group_descs[PG_CALL + 0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL + 0].callables.moduleDC = optix_module;
      group_descs[PG_CALL + 0].callables.entryFunctionNameDC = "__direct_callable__svm_eval_nodes";
      group_descs[PG_CALL + 1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL + 1].callables.moduleDC = optix_module;
      group_descs[PG_CALL + 1].callables.entryFunctionNameDC =
          "__direct_callable__kernel_volume_shadow";
      group_descs[PG_CALL + 2].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
      group_descs[PG_CALL + 2].callables.moduleDC = optix_module;
      group_descs[PG_CALL + 2].callables.entryFunctionNameDC =
          "__direct_callable__subsurface_scatter_multi_setup";
    }

    check_result_optix_ret(optixProgramGroupCreate(
        context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));

    // Get program stack sizes
    OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
    // Set up SBT, which in this case is used only to select between different programs
    sbt_data.alloc(NUM_PROGRAM_GROUPS);
    memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      check_result_optix_ret(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
      check_result_optix_ret(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
    }
    sbt_data.copy_to_device();  // Upload SBT to device

    // Calculate maximum trace continuation stack size
    unsigned int trace_css = stack_size[PG_HITD].cssCH;
    // This is based on the maximum of closest-hit and any-hit/intersection programs
    trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
    trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
    trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
#  if OPTIX_ABI_VERSION >= 36
    trace_css = std::max(trace_css,
                         stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
    trace_css = std::max(trace_css,
                         stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
#  endif

    OptixPipelineLinkOptions link_options = {};
    link_options.maxTraceDepth = 1;
#  ifdef WITH_CYCLES_DEBUG
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
#  else
    link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_LINEINFO;
#  endif
#  if OPTIX_ABI_VERSION < 24
    link_options.overrideUsesMotionBlur = motion_blur;
#  endif

    {  // Create path tracing pipeline
      vector<OptixProgramGroup> pipeline_groups;
      pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
      pipeline_groups.push_back(groups[PG_RGEN]);
      pipeline_groups.push_back(groups[PG_MISS]);
      pipeline_groups.push_back(groups[PG_HITD]);
      pipeline_groups.push_back(groups[PG_HITS]);
      pipeline_groups.push_back(groups[PG_HITL]);
#  if OPTIX_ABI_VERSION >= 36
      if (motion_blur) {
        pipeline_groups.push_back(groups[PG_HITD_MOTION]);
        pipeline_groups.push_back(groups[PG_HITS_MOTION]);
      }
#  endif
      if (requested_features.use_shader_raytrace) {
        pipeline_groups.push_back(groups[PG_CALL + 0]);
        pipeline_groups.push_back(groups[PG_CALL + 1]);
        pipeline_groups.push_back(groups[PG_CALL + 2]);
      }

      check_result_optix_ret(optixPipelineCreate(context,
                                                 &pipeline_options,
                                                 &link_options,
                                                 pipeline_groups.data(),
                                                 pipeline_groups.size(),
                                                 nullptr,
                                                 0,
                                                 &pipelines[PIP_PATH_TRACE]));

      // Combine ray generation and trace continuation stack size
      const unsigned int css = stack_size[PG_RGEN].cssRG + link_options.maxTraceDepth * trace_css;
      // Max direct callable depth is one of the following, so combine accordingly
      // - __raygen__ -> svm_eval_nodes
      // - __raygen__ -> kernel_volume_shadow -> svm_eval_nodes
      // - __raygen__ -> subsurface_scatter_multi_setup -> svm_eval_nodes
      const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
                               std::max(stack_size[PG_CALL + 1].dssDC,
                                        stack_size[PG_CALL + 2].dssDC);

      // Set stack size depending on pipeline options
      // (max traversable graph depth: 3 with motion transforms, 2 otherwise)
      check_result_optix_ret(
          optixPipelineSetStackSize(pipelines[PIP_PATH_TRACE],
                                    0,
                                    requested_features.use_shader_raytrace ? dss : 0,
                                    css,
                                    motion_blur ? 3 : 2));
    }

    // Only need to create shader evaluation pipeline if one of these features is used:
    const bool use_shader_eval_pipeline = requested_features.use_baking ||
                                          requested_features.use_background_light ||
                                          requested_features.use_true_displacement;

    if (use_shader_eval_pipeline) {  // Create shader evaluation pipeline
      vector<OptixProgramGroup> pipeline_groups;
      pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
      pipeline_groups.push_back(groups[PG_BAKE]);
      pipeline_groups.push_back(groups[PG_DISP]);
      pipeline_groups.push_back(groups[PG_BACK]);
      pipeline_groups.push_back(groups[PG_MISS]);
      pipeline_groups.push_back(groups[PG_HITD]);
      pipeline_groups.push_back(groups[PG_HITS]);
      pipeline_groups.push_back(groups[PG_HITL]);
#  if OPTIX_ABI_VERSION >= 36
      if (motion_blur) {
        pipeline_groups.push_back(groups[PG_HITD_MOTION]);
        pipeline_groups.push_back(groups[PG_HITS_MOTION]);
      }
#  endif
      if (requested_features.use_shader_raytrace) {
        pipeline_groups.push_back(groups[PG_CALL + 0]);
        pipeline_groups.push_back(groups[PG_CALL + 1]);
        pipeline_groups.push_back(groups[PG_CALL + 2]);
      }

      check_result_optix_ret(optixPipelineCreate(context,
                                                 &pipeline_options,
                                                 &link_options,
                                                 pipeline_groups.data(),
                                                 pipeline_groups.size(),
                                                 nullptr,
                                                 0,
                                                 &pipelines[PIP_SHADER_EVAL]));

      // Calculate continuation stack size based on the maximum of all ray generation stack sizes
      const unsigned int css = std::max(stack_size[PG_BAKE].cssRG,
                                        std::max(stack_size[PG_DISP].cssRG,
                                                 stack_size[PG_BACK].cssRG)) +
                               link_options.maxTraceDepth * trace_css;
      const unsigned int dss = stack_size[PG_CALL + 0].dssDC +
                               std::max(stack_size[PG_CALL + 1].dssDC,
                                        stack_size[PG_CALL + 2].dssDC);

      check_result_optix_ret(
          optixPipelineSetStackSize(pipelines[PIP_SHADER_EVAL],
                                    0,
                                    requested_features.use_shader_raytrace ? dss : 0,
                                    css,
                                    motion_blur ? 3 : 2));
    }

    // Clean up program group objects (the pipelines keep what they need)
    for (unsigned int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
      optixProgramGroupDestroy(groups[i]);
    }

    return true;
  }
698 
  // Main task entry point, executed by one worker thread per CUDA stream.
  // Dispatches render tiles (path tracing via OptiX, baking via CUDA,
  // denoising), shader evaluation, or whole-buffer denoising.
  void thread_run(DeviceTask &task, int thread_index)
  {
    if (have_error())
      return;  // Abort early if there was an error previously

    if (task.type == DeviceTask::RENDER) {
      if (thread_index != 0) {
        // Only execute denoising in a single thread (see also 'task_add')
        task.tile_types &= ~RenderTile::DENOISE;
      }

      // Keep acquiring tiles until the task runs out of work or is aborted
      RenderTile tile;
      while (task.acquire_tile(this, tile, task.tile_types)) {
        if (tile.task == RenderTile::PATH_TRACE)
          launch_render(task, tile, thread_index);
        else if (tile.task == RenderTile::BAKE) {
          // Perform baking using CUDA, since it is not currently implemented in OptiX
          device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
          CUDADevice::render(task, tile, work_tiles);
        }
        else if (tile.task == RenderTile::DENOISE)
          launch_denoise(task, tile);
        task.release_tile(tile);
        if (task.get_cancel() && !task.need_finish_queue)
          break;  // User requested cancellation
        else if (have_error())
          break;  // Abort rendering when encountering an error
      }
    }
    else if (task.type == DeviceTask::SHADER) {
      launch_shader_eval(task, thread_index);
    }
    else if (task.type == DeviceTask::DENOISE_BUFFER) {
      // Set up a single tile that covers the whole task and denoise it
      RenderTile tile;
      tile.x = task.x;
      tile.y = task.y;
      tile.w = task.w;
      tile.h = task.h;
      tile.buffer = task.buffer;
      tile.num_samples = task.num_samples;
      tile.start_sample = task.sample;
      tile.offset = task.offset;
      tile.stride = task.stride;
      tile.buffers = task.buffers;

      launch_denoise(task, tile);
    }
  }
748 
  // Path trace a tile by launching the OptiX path tracing pipeline in batches
  // of samples on the CUDA stream belonging to 'thread_index'. Each stream uses
  // its own slice of the launch parameter buffer so launches can run
  // concurrently. Progress and the current sample are updated per batch.
  void launch_render(DeviceTask &task, RenderTile &rtile, int thread_index)
  {
    assert(thread_index < launch_params.data_size);

    // Keep track of total render time of this tile
    const scoped_timer timer(&rtile.buffers->render_time);

    // Build the device-side work tile description from the render tile
    WorkTile wtile;
    wtile.x = rtile.x;
    wtile.y = rtile.y;
    wtile.w = rtile.w;
    wtile.h = rtile.h;
    wtile.offset = rtile.offset;
    wtile.stride = rtile.stride;
    wtile.buffer = (float *)rtile.buffer;

    const int end_sample = rtile.start_sample + rtile.num_samples;
    // Keep this number reasonable to avoid running into TDRs
    int step_samples = (info.display_device ? 8 : 32);

    // Offset into launch params buffer so that streams use separate data
    device_ptr launch_params_ptr = launch_params.device_pointer +
                                   thread_index * launch_params.data_elements;

    const CUDAContextScope scope(cuContext);

    for (int sample = rtile.start_sample; sample < end_sample;) {
      // Copy work tile information to device
      wtile.start_sample = sample;
      wtile.num_samples = step_samples;
      if (task.adaptive_sampling.use) {
        wtile.num_samples = task.adaptive_sampling.align_samples(sample, step_samples);
      }
      // Clamp so the last batch does not render past the end sample
      wtile.num_samples = min(wtile.num_samples, end_sample - sample);
      device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
      check_result_cuda(
          cuMemcpyHtoDAsync(d_wtile_ptr, &wtile, sizeof(wtile), cuda_stream[thread_index]));

      // Build the shader binding table layout; record offsets follow the PG_*
      // enum order that sbt_data was packed with in load_kernels
      OptixShaderBindingTable sbt_params = {};
      sbt_params.raygenRecord = sbt_data.device_pointer + PG_RGEN * sizeof(SbtRecord);
      sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
      sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
      sbt_params.missRecordCount = 1;
      sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
      sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
#  if OPTIX_ABI_VERSION >= 36
      sbt_params.hitgroupRecordCount = 5;  // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
#  else
      sbt_params.hitgroupRecordCount = 3;  // PG_HITD, PG_HITS, PG_HITL
#  endif
      sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
      sbt_params.callablesRecordCount = 3;
      sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);

      // Launch the ray generation program
      check_result_optix(optixLaunch(pipelines[PIP_PATH_TRACE],
                                     cuda_stream[thread_index],
                                     launch_params_ptr,
                                     launch_params.data_elements,
                                     &sbt_params,
                                     // Launch with samples close to each other for better locality
                                     wtile.w * wtile.num_samples,
                                     wtile.h,
                                     1));

      // Run the adaptive sampling kernels at selected samples aligned to step samples.
      uint filter_sample = wtile.start_sample + wtile.num_samples - 1;
      if (task.adaptive_sampling.use && task.adaptive_sampling.need_filter(filter_sample)) {
        adaptive_sampling_filter(filter_sample, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
      }

      // Wait for launch to finish
      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));

      // Update current sample, so it is displayed correctly
      sample += wtile.num_samples;
      rtile.sample = sample;
      // Update task progress after the kernel completed rendering
      task.update_progress(&rtile, wtile.w * wtile.h * wtile.num_samples);

      if (task.get_cancel() && !task.need_finish_queue)
        return;  // Cancel rendering
    }

    // Finalize adaptive sampling (scale buffers for the final sample count)
    if (task.adaptive_sampling.use) {
      device_ptr d_wtile_ptr = launch_params_ptr + offsetof(KernelParams, tile);
      adaptive_sampling_post(rtile, &wtile, d_wtile_ptr, cuda_stream[thread_index]);
      check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
      task.update_progress(&rtile, rtile.w * rtile.h * wtile.num_samples);
    }
  }
841 
  /* Denoise the given tile, choosing between the OptiX AI denoiser and the
   * CUDA (NLM) denoising kernels based on the task settings.
   * Always returns true on the paths visible here; failures inside the OptiX
   * path exit early through the check_result_*_ret macros.
   * NOTE(review): 'rtile' may be modified (replaced by the mapped center tile). */
  bool launch_denoise(DeviceTask &task, RenderTile &rtile)
  {
    // Update current sample (for display and NLM denoising task)
    rtile.sample = rtile.start_sample + rtile.num_samples;

    // Make CUDA context current now, since it is used for both denoising tasks
    const CUDAContextScope scope(cuContext);

    // Choose between OptiX and NLM denoising
    if (task.denoising.type == DENOISER_OPTIX) {
      // Map neighboring tiles onto this device, indices are as following:
      // Where index 4 is the center tile and index 9 is the target for the result.
      // 0 1 2
      // 3 4 5
      // 6 7 8 9
      RenderTileNeighbors neighbors(rtile);
      task.map_neighbor_tiles(neighbors, this);
      RenderTile &center_tile = neighbors.tiles[RenderTileNeighbors::CENTER];
      RenderTile &target_tile = neighbors.target;
      rtile = center_tile;  // Tile may have been modified by mapping code

      // Calculate size of the tile to denoise (including overlap)
      int4 rect = center_tile.bounds();
      // Overlap between tiles has to be at least 64 pixels
      // TODO(pmours): Query this value from OptiX
      rect = rect_expand(rect, 64);
      int4 clip_rect = neighbors.bounds();
      rect = rect_clip(rect, clip_rect);
      int2 rect_size = make_int2(rect.z - rect.x, rect.w - rect.y);
      int2 overlap_offset = make_int2(rtile.x - rect.x, rtile.y - rect.y);

      // Calculate byte offsets and strides
      int pixel_stride = task.pass_stride * (int)sizeof(float);
      int pixel_offset = (rtile.offset + rtile.x + rtile.y * rtile.stride) * pixel_stride;
      // Byte offsets of the color/albedo/normal denoising passes within a pixel
      const int pass_offset[3] = {
          (task.pass_denoising_data + DENOISING_PASS_COLOR) * (int)sizeof(float),
          (task.pass_denoising_data + DENOISING_PASS_ALBEDO) * (int)sizeof(float),
          (task.pass_denoising_data + DENOISING_PASS_NORMAL) * (int)sizeof(float)};

      // Start with the current tile pointer offset
      int input_stride = pixel_stride;
      device_ptr input_ptr = rtile.buffer + pixel_offset;

      // Copy tile data into a common buffer if necessary
      device_only_memory<float> input(this, "denoiser input", true);
      device_vector<TileInfo> tile_info_mem(this, "denoiser tile info", MEM_READ_ONLY);

      // Tiles are contiguous when all mapped neighbors share the center buffer
      bool contiguous_memory = true;
      for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
        if (neighbors.tiles[i].buffer && neighbors.tiles[i].buffer != rtile.buffer) {
          contiguous_memory = false;
        }
      }

      if (contiguous_memory) {
        // Tiles are in contiguous memory, so can just subtract overlap offset
        input_ptr -= (overlap_offset.x + overlap_offset.y * rtile.stride) * pixel_stride;
        // Stride covers the whole width of the image and not just a single tile
        input_stride *= rtile.stride;
      }
      else {
        // Adjacent tiles are in separate memory regions, so need to copy them into a single one
        input.alloc_to_device(rect_size.x * rect_size.y * task.pass_stride);
        // Start with the new input buffer
        input_ptr = input.device_pointer;
        // Stride covers the width of the new input buffer, which includes tile width and overlap
        input_stride *= rect_size.x;

        TileInfo *tile_info = tile_info_mem.alloc(1);
        for (int i = 0; i < RenderTileNeighbors::SIZE; i++) {
          tile_info->offsets[i] = neighbors.tiles[i].offset;
          tile_info->strides[i] = neighbors.tiles[i].stride;
          tile_info->buffers[i] = neighbors.tiles[i].buffer;
        }
        // Tile edge coordinates in the 3x3 neighborhood (see index diagram above)
        tile_info->x[0] = neighbors.tiles[3].x;
        tile_info->x[1] = neighbors.tiles[4].x;
        tile_info->x[2] = neighbors.tiles[5].x;
        tile_info->x[3] = neighbors.tiles[5].x + neighbors.tiles[5].w;
        tile_info->y[0] = neighbors.tiles[1].y;
        tile_info->y[1] = neighbors.tiles[4].y;
        tile_info->y[2] = neighbors.tiles[7].y;
        tile_info->y[3] = neighbors.tiles[7].y + neighbors.tiles[7].h;
        tile_info_mem.copy_to_device();

        void *args[] = {
            &input.device_pointer, &tile_info_mem.device_pointer, &rect.x, &task.pass_stride};
        launch_filter_kernel("kernel_cuda_filter_copy_input", rect_size.x, rect_size.y, args);
      }

# if OPTIX_DENOISER_NO_PIXEL_STRIDE
      // Repack the interleaved pass data into a tightly packed RGB buffer, since
      // a per-pixel stride is not used in this configuration (see define above)
      device_only_memory<float> input_rgb(this, "denoiser input rgb", true);
      input_rgb.alloc_to_device(rect_size.x * rect_size.y * 3 * task.denoising.input_passes);

      void *input_args[] = {&input_rgb.device_pointer,
                            &input_ptr,
                            &rect_size.x,
                            &rect_size.y,
                            &input_stride,
                            &task.pass_stride,
                            const_cast<int *>(pass_offset),
                            &task.denoising.input_passes,
                            &rtile.sample};
      launch_filter_kernel(
          "kernel_cuda_filter_convert_to_rgb", rect_size.x, rect_size.y, input_args);

      input_ptr = input_rgb.device_pointer;
      pixel_stride = 3 * sizeof(float);
      input_stride = rect_size.x * pixel_stride;
# endif

      // The denoiser handle depends on the number of input passes, so it has to
      // be recreated whenever that changes
      const bool recreate_denoiser = (denoiser == NULL) ||
                                     (task.denoising.input_passes != denoiser_input_passes);
      if (recreate_denoiser) {
        // Destroy existing handle before creating new one
        if (denoiser != NULL) {
          optixDenoiserDestroy(denoiser);
        }

        // Create OptiX denoiser handle on demand when it is first used
        OptixDenoiserOptions denoiser_options = {};
        assert(task.denoising.input_passes >= 1 && task.denoising.input_passes <= 3);
        denoiser_options.inputKind = static_cast<OptixDenoiserInputKind>(
            OPTIX_DENOISER_INPUT_RGB + (task.denoising.input_passes - 1));
# if OPTIX_ABI_VERSION < 28
        denoiser_options.pixelFormat = OPTIX_PIXEL_FORMAT_FLOAT3;
# endif
        check_result_optix_ret(optixDenoiserCreate(context, &denoiser_options, &denoiser));
        check_result_optix_ret(
            optixDenoiserSetModel(denoiser, OPTIX_DENOISER_MODEL_KIND_HDR, NULL, 0));

        // OptiX denoiser handle was created with the requested number of input passes
        denoiser_input_passes = task.denoising.input_passes;
      }

      OptixDenoiserSizes sizes = {};
      check_result_optix_ret(
          optixDenoiserComputeMemoryResources(denoiser, rect_size.x, rect_size.y, &sizes));

# if OPTIX_ABI_VERSION < 28
      const size_t scratch_size = sizes.recommendedScratchSizeInBytes;
# else
      const size_t scratch_size = sizes.withOverlapScratchSizeInBytes;
# endif
      const size_t scratch_offset = sizes.stateSizeInBytes;

      // Allocate denoiser state if tile size has changed since last setup
      if (recreate_denoiser || (denoiser_state.data_width != rect_size.x ||
                                denoiser_state.data_height != rect_size.y)) {
        // State and scratch share one allocation: [0, scratch_offset) holds the
        // state, [scratch_offset, scratch_offset + scratch_size) the scratch area
        denoiser_state.alloc_to_device(scratch_offset + scratch_size);

        // Initialize denoiser state for the current tile size
        check_result_optix_ret(optixDenoiserSetup(denoiser,
                                                  0,
                                                  rect_size.x,
                                                  rect_size.y,
                                                  denoiser_state.device_pointer,
                                                  scratch_offset,
                                                  denoiser_state.device_pointer + scratch_offset,
                                                  scratch_size));

        // Remember the tile size the state was set up for
        denoiser_state.data_width = rect_size.x;
        denoiser_state.data_height = rect_size.y;
      }

      // Set up input and output layer information
      OptixImage2D input_layers[3] = {};
      OptixImage2D output_layers[1] = {};

      for (int i = 0; i < 3; ++i) {
# if OPTIX_DENOISER_NO_PIXEL_STRIDE
        input_layers[i].data = input_ptr + (rect_size.x * rect_size.y * pixel_stride * i);
# else
        input_layers[i].data = input_ptr + pass_offset[i];
# endif
        input_layers[i].width = rect_size.x;
        input_layers[i].height = rect_size.y;
        input_layers[i].rowStrideInBytes = input_stride;
        input_layers[i].pixelStrideInBytes = pixel_stride;
        input_layers[i].format = OPTIX_PIXEL_FORMAT_FLOAT3;
      }

# if OPTIX_DENOISER_NO_PIXEL_STRIDE
      // Denoise in-place into the repacked RGB buffer; the result is copied into
      // the target tile afterwards by the convert_from_rgb kernel below
      output_layers[0].data = input_ptr;
      output_layers[0].width = rect_size.x;
      output_layers[0].height = rect_size.y;
      output_layers[0].rowStrideInBytes = input_stride;
      output_layers[0].pixelStrideInBytes = pixel_stride;
      int2 output_offset = overlap_offset;
      overlap_offset = make_int2(0, 0);  // Not supported by denoiser API, so apply manually
# else
      output_layers[0].data = target_tile.buffer + pixel_offset;
      output_layers[0].width = target_tile.w;
      output_layers[0].height = target_tile.h;
      output_layers[0].rowStrideInBytes = target_tile.stride * pixel_stride;
      output_layers[0].pixelStrideInBytes = pixel_stride;
# endif
      output_layers[0].format = OPTIX_PIXEL_FORMAT_FLOAT3;

      // Finally run denoising
      OptixDenoiserParams params = {};  // All parameters are disabled/zero
      check_result_optix_ret(optixDenoiserInvoke(denoiser,
                                                 0,
                                                 &params,
                                                 denoiser_state.device_pointer,
                                                 scratch_offset,
                                                 input_layers,
                                                 task.denoising.input_passes,
                                                 overlap_offset.x,
                                                 overlap_offset.y,
                                                 output_layers,
                                                 denoiser_state.device_pointer + scratch_offset,
                                                 scratch_size));

# if OPTIX_DENOISER_NO_PIXEL_STRIDE
      // Copy the denoised RGB result into the target tile pass layout
      void *output_args[] = {&input_ptr,
                             &target_tile.buffer,
                             &output_offset.x,
                             &output_offset.y,
                             &rect_size.x,
                             &rect_size.y,
                             &target_tile.x,
                             &target_tile.y,
                             &target_tile.w,
                             &target_tile.h,
                             &target_tile.offset,
                             &target_tile.stride,
                             &task.pass_stride,
                             &rtile.sample};
      launch_filter_kernel(
          "kernel_cuda_filter_convert_from_rgb", target_tile.w, target_tile.h, output_args);
# endif

      // Wait for the denoiser and the filter kernels before unmapping the tiles
      check_result_cuda_ret(cuStreamSynchronize(0));

      task.unmap_neighbor_tiles(neighbors, this);
    }
    else {
      // Run CUDA denoising kernels
      DenoisingTask denoising(this, task);
      CUDADevice::denoise(rtile, denoising);
    }

    // Update task progress after the denoiser completed processing
    task.update_progress(&rtile, rtile.w * rtile.h);

    return true;
  }
1089 
1090  void launch_shader_eval(DeviceTask &task, int thread_index)
1091  {
1092  unsigned int rgen_index = PG_BACK;
1093  if (task.shader_eval_type >= SHADER_EVAL_BAKE)
1094  rgen_index = PG_BAKE;
1095  if (task.shader_eval_type == SHADER_EVAL_DISPLACE)
1096  rgen_index = PG_DISP;
1097 
1098  const CUDAContextScope scope(cuContext);
1099 
1100  device_ptr launch_params_ptr = launch_params.device_pointer +
1101  thread_index * launch_params.data_elements;
1102 
1103  for (int sample = 0; sample < task.num_samples; ++sample) {
1104  ShaderParams params;
1105  params.input = (uint4 *)task.shader_input;
1106  params.output = (float4 *)task.shader_output;
1107  params.type = task.shader_eval_type;
1108  params.filter = task.shader_filter;
1109  params.sx = task.shader_x;
1110  params.offset = task.offset;
1111  params.sample = sample;
1112 
1113  check_result_cuda(cuMemcpyHtoDAsync(launch_params_ptr + offsetof(KernelParams, shader),
1114  &params,
1115  sizeof(params),
1116  cuda_stream[thread_index]));
1117 
1118  OptixShaderBindingTable sbt_params = {};
1119  sbt_params.raygenRecord = sbt_data.device_pointer + rgen_index * sizeof(SbtRecord);
1120  sbt_params.missRecordBase = sbt_data.device_pointer + PG_MISS * sizeof(SbtRecord);
1121  sbt_params.missRecordStrideInBytes = sizeof(SbtRecord);
1122  sbt_params.missRecordCount = 1;
1123  sbt_params.hitgroupRecordBase = sbt_data.device_pointer + PG_HITD * sizeof(SbtRecord);
1124  sbt_params.hitgroupRecordStrideInBytes = sizeof(SbtRecord);
1125 # if OPTIX_ABI_VERSION >= 36
1126  sbt_params.hitgroupRecordCount = 5; // PG_HITD(_MOTION), PG_HITS(_MOTION), PG_HITL
1127 # else
1128  sbt_params.hitgroupRecordCount = 3; // PG_HITD, PG_HITS, PG_HITL
1129 # endif
1130  sbt_params.callablesRecordBase = sbt_data.device_pointer + PG_CALL * sizeof(SbtRecord);
1131  sbt_params.callablesRecordCount = 3;
1132  sbt_params.callablesRecordStrideInBytes = sizeof(SbtRecord);
1133 
1134  check_result_optix(optixLaunch(pipelines[PIP_SHADER_EVAL],
1135  cuda_stream[thread_index],
1136  launch_params_ptr,
1137  launch_params.data_elements,
1138  &sbt_params,
1139  task.shader_w,
1140  1,
1141  1));
1142 
1143  check_result_cuda(cuStreamSynchronize(cuda_stream[thread_index]));
1144 
1145  task.update_progress(NULL);
1146  }
1147  }
1148 
  /* Build (or refit) one OptiX acceleration structure described by 'build_input'
   * and store the resulting traversable handle in 'bvh'.
   * Returns false when a required device allocation fails; OptiX/CUDA call
   * failures exit through the check_result_*_ret macros. */
  bool build_optix_bvh(BVHOptiX *bvh,
                       OptixBuildOperation operation,
                       const OptixBuildInput &build_input,
                       uint16_t num_motion_steps)
  {
    /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
     * from running out of memory (since both original and compacted acceleration structure memory
     * may be allocated at the same time for the duration of this function). The builds would
     * otherwise happen on the same CUDA stream anyway. */
    static thread_mutex mutex;
    thread_scoped_lock lock(mutex);

    const CUDAContextScope scope(cuContext);

    // Compute memory usage
    OptixAccelBufferSizes sizes = {};
    OptixAccelBuildOptions options = {};
    options.operation = operation;
    if (background) {
      // Prefer best performance and lowest memory consumption in background
      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
    }
    else {
      // Prefer fast updates in viewport
      options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
    }

    // Motion options over the shutter interval [0, 1]; geometry vanishes outside it
    options.motionOptions.numKeys = num_motion_steps;
    options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
    options.motionOptions.timeBegin = 0.0f;
    options.motionOptions.timeEnd = 1.0f;

    check_result_optix_ret(
        optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));

    // Allocate required output buffers
    // (the extra 8 bytes leave room for the 8-byte-aligned compacted-size property below)
    device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
    temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
    if (!temp_mem.device_pointer)
      return false;  // Make sure temporary memory allocation succeeded

    // Acceleration structure memory has to be allocated on the device (not allowed to be on host)
    device_only_memory<char> &out_data = bvh->as_data;
    if (operation == OPTIX_BUILD_OPERATION_BUILD) {
      assert(out_data.device == this);
      out_data.alloc_to_device(sizes.outputSizeInBytes);
      if (!out_data.device_pointer)
        return false;
    }
    else {
      // A refit reuses the existing allocation, which must still be large enough
      assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
    }

    // Finally build the acceleration structure
    OptixAccelEmitDesc compacted_size_prop = {};
    compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
    // A tiny space was allocated for this property at the end of the temporary buffer above
    // Make sure this pointer is 8-byte aligned
    compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);

    OptixTraversableHandle out_handle = 0;
    check_result_optix_ret(optixAccelBuild(context,
                                           NULL,
                                           &options,
                                           &build_input,
                                           1,
                                           temp_mem.device_pointer,
                                           sizes.tempSizeInBytes,
                                           out_data.device_pointer,
                                           sizes.outputSizeInBytes,
                                           &out_handle,
                                           // Only emit the compacted size when compaction is on
                                           background ? &compacted_size_prop : NULL,
                                           background ? 1 : 0));
    bvh->traversable_handle = static_cast<uint64_t>(out_handle);

    // Wait for all operations to finish
    check_result_cuda_ret(cuStreamSynchronize(NULL));

    // Compact acceleration structure to save memory (do not do this in viewport for faster builds)
    if (background) {
      uint64_t compacted_size = sizes.outputSizeInBytes;
      check_result_cuda_ret(
          cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));

      // Temporary memory is no longer needed, so free it now to make space
      temp_mem.free();

      // There is no point compacting if the size does not change
      if (compacted_size < sizes.outputSizeInBytes) {
        device_only_memory<char> compacted_data(this, "optix compacted as", false);
        compacted_data.alloc_to_device(compacted_size);
        if (!compacted_data.device_pointer)
          // Do not compact if memory allocation for compacted acceleration structure fails
          // Can just use the uncompacted one then, so succeed here regardless
          return true;

        check_result_optix_ret(optixAccelCompact(context,
                                                 NULL,
                                                 out_handle,
                                                 compacted_data.device_pointer,
                                                 compacted_size,
                                                 &out_handle));
        bvh->traversable_handle = static_cast<uint64_t>(out_handle);

        // Wait for compaction to finish
        check_result_cuda_ret(cuStreamSynchronize(NULL));

        // Keep the compacted allocation in the BVH by swapping it in
        std::swap(out_data.device_size, compacted_data.device_size);
        std::swap(out_data.device_pointer, compacted_data.device_pointer);
        // Original acceleration structure memory is freed when 'compacted_data' goes out of scope
      }
    }

    return true;
  }
1264 
1265  void build_bvh(BVH *bvh, Progress &progress, bool refit) override
1266  {
1267  if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
1268  /* For baking CUDA is used, build appropriate BVH for that. */
1269  Device::build_bvh(bvh, progress, refit);
1270  return;
1271  }
1272 
1273  free_bvh_memory_delayed();
1274 
1275  BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1276 
1277  progress.set_substatus("Building OptiX acceleration structure");
1278 
1279  if (!bvh->params.top_level) {
1280  assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
1281 
1282  // Refit is only possible in viewport for now (because AS is built with
1283  // OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above)
1284  OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
1285  if (refit && !background) {
1286  assert(bvh_optix->traversable_handle != 0);
1287  operation = OPTIX_BUILD_OPERATION_UPDATE;
1288  }
1289  else {
1290  bvh_optix->as_data.free();
1291  bvh_optix->traversable_handle = 0;
1292  }
1293 
1294  // Build bottom level acceleration structures (BLAS)
1295  Geometry *const geom = bvh->geometry[0];
1296  if (geom->geometry_type == Geometry::HAIR) {
1297  // Build BLAS for curve primitives
1298  Hair *const hair = static_cast<Hair *const>(geom);
1299  if (hair->num_curves() == 0) {
1300  return;
1301  }
1302 
1303  const size_t num_segments = hair->num_segments();
1304 
1305  size_t num_motion_steps = 1;
1307  if (motion_blur && hair->get_use_motion_blur() && motion_keys) {
1308  num_motion_steps = hair->get_motion_steps();
1309  }
1310 
1311  device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
1312 # if OPTIX_ABI_VERSION >= 36
1313  device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1314  device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1315  // Four control points for each curve segment
1316  const size_t num_vertices = num_segments * 4;
1317  if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
1318  index_data.alloc(num_segments);
1319  vertex_data.alloc(num_vertices * num_motion_steps);
1320  }
1321  else
1322 # endif
1323  aabb_data.alloc(num_segments * num_motion_steps);
1324 
1325  // Get AABBs for each motion step
1326  for (size_t step = 0; step < num_motion_steps; ++step) {
1327  // The center step for motion vertices is not stored in the attribute
1328  const float3 *keys = hair->get_curve_keys().data();
1329  size_t center_step = (num_motion_steps - 1) / 2;
1330  if (step != center_step) {
1331  size_t attr_offset = (step > center_step) ? step - 1 : step;
1332  // Technically this is a float4 array, but sizeof(float3) == sizeof(float4)
1333  keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
1334  }
1335 
1336  for (size_t j = 0, i = 0; j < hair->num_curves(); ++j) {
1337  const Hair::Curve curve = hair->get_curve(j);
1338 # if OPTIX_ABI_VERSION >= 36
1339  const array<float> &curve_radius = hair->get_curve_radius();
1340 # endif
1341 
1342  for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
1343 # if OPTIX_ABI_VERSION >= 36
1344  if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
1345  int k0 = curve.first_key + segment;
1346  int k1 = k0 + 1;
1347  int ka = max(k0 - 1, curve.first_key);
1348  int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
1349 
1350  const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
1351  const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
1352  const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
1353  const float4 pw = make_float4(
1354  curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
1355 
1356  // Convert Catmull-Rom data to Bezier spline
1357  static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
1358  static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
1359  static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
1360  static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
1361 
1362  index_data[i] = i * 4;
1363  float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
1364  v[0] = make_float4(
1365  dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
1366  v[1] = make_float4(
1367  dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
1368  v[2] = make_float4(
1369  dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
1370  v[3] = make_float4(
1371  dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
1372  }
1373  else
1374 # endif
1375  {
1377  curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
1378 
1379  const size_t index = step * num_segments + i;
1380  aabb_data[index].minX = bounds.min.x;
1381  aabb_data[index].minY = bounds.min.y;
1382  aabb_data[index].minZ = bounds.min.z;
1383  aabb_data[index].maxX = bounds.max.x;
1384  aabb_data[index].maxY = bounds.max.y;
1385  aabb_data[index].maxZ = bounds.max.z;
1386  }
1387  }
1388  }
1389  }
1390 
1391  // Upload AABB data to GPU
1392  aabb_data.copy_to_device();
1393 # if OPTIX_ABI_VERSION >= 36
1394  index_data.copy_to_device();
1395  vertex_data.copy_to_device();
1396 # endif
1397 
1398  vector<device_ptr> aabb_ptrs;
1399  aabb_ptrs.reserve(num_motion_steps);
1400 # if OPTIX_ABI_VERSION >= 36
1401  vector<device_ptr> width_ptrs;
1402  vector<device_ptr> vertex_ptrs;
1403  width_ptrs.reserve(num_motion_steps);
1404  vertex_ptrs.reserve(num_motion_steps);
1405 # endif
1406  for (size_t step = 0; step < num_motion_steps; ++step) {
1407  aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
1408 # if OPTIX_ABI_VERSION >= 36
1409  const device_ptr base_ptr = vertex_data.device_pointer +
1410  step * num_vertices * sizeof(float4);
1411  width_ptrs.push_back(base_ptr + 3 * sizeof(float)); // Offset by vertex size
1412  vertex_ptrs.push_back(base_ptr);
1413 # endif
1414  }
1415 
1416  // Force a single any-hit call, so shadow record-all behavior works correctly
1417  unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1418  OptixBuildInput build_input = {};
1419 # if OPTIX_ABI_VERSION >= 36
1420  if (DebugFlags().optix.curves_api && hair->curve_shape == CURVE_THICK) {
1421  build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
1422  build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
1423  build_input.curveArray.numPrimitives = num_segments;
1424  build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1425  build_input.curveArray.numVertices = num_vertices;
1426  build_input.curveArray.vertexStrideInBytes = sizeof(float4);
1427  build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
1428  build_input.curveArray.widthStrideInBytes = sizeof(float4);
1429  build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
1430  build_input.curveArray.indexStrideInBytes = sizeof(int);
1431  build_input.curveArray.flag = build_flags;
1432  build_input.curveArray.primitiveIndexOffset = hair->optix_prim_offset;
1433  }
1434  else
1435 # endif
1436  {
1437  // Disable visibility test any-hit program, since it is already checked during
1438  // intersection. Those trace calls that require anyhit can force it with a ray flag.
1439  build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
1440 
1441  build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1442 # if OPTIX_ABI_VERSION < 23
1443  build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1444  build_input.aabbArray.numPrimitives = num_segments;
1445  build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
1446  build_input.aabbArray.flags = &build_flags;
1447  build_input.aabbArray.numSbtRecords = 1;
1448  build_input.aabbArray.primitiveIndexOffset = hair->optix_prim_offset;
1449 # else
1450  build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1451  build_input.customPrimitiveArray.numPrimitives = num_segments;
1452  build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
1453  build_input.customPrimitiveArray.flags = &build_flags;
1454  build_input.customPrimitiveArray.numSbtRecords = 1;
1455  build_input.customPrimitiveArray.primitiveIndexOffset = hair->optix_prim_offset;
1456 # endif
1457  }
1458 
1459  if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1460  progress.set_error("Failed to build OptiX acceleration structure");
1461  }
1462  }
1463  else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
1464  // Build BLAS for triangle primitives
1465  Mesh *const mesh = static_cast<Mesh *const>(geom);
1466  if (mesh->num_triangles() == 0) {
1467  return;
1468  }
1469 
1470  const size_t num_verts = mesh->get_verts().size();
1471 
1472  size_t num_motion_steps = 1;
1474  if (motion_blur && mesh->get_use_motion_blur() && motion_keys) {
1475  num_motion_steps = mesh->get_motion_steps();
1476  }
1477 
1478  device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1479  index_data.alloc(mesh->get_triangles().size());
1480  memcpy(index_data.data(),
1481  mesh->get_triangles().data(),
1482  mesh->get_triangles().size() * sizeof(int));
1483  device_vector<float3> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1484  vertex_data.alloc(num_verts * num_motion_steps);
1485 
1486  for (size_t step = 0; step < num_motion_steps; ++step) {
1487  const float3 *verts = mesh->get_verts().data();
1488 
1489  size_t center_step = (num_motion_steps - 1) / 2;
1490  // The center step for motion vertices is not stored in the attribute
1491  if (step != center_step) {
1492  verts = motion_keys->data_float3() +
1493  (step > center_step ? step - 1 : step) * num_verts;
1494  }
1495 
1496  memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
1497  }
1498 
1499  // Upload triangle data to GPU
1500  index_data.copy_to_device();
1501  vertex_data.copy_to_device();
1502 
1503  vector<device_ptr> vertex_ptrs;
1504  vertex_ptrs.reserve(num_motion_steps);
1505  for (size_t step = 0; step < num_motion_steps; ++step) {
1506  vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
1507  }
1508 
1509  // Force a single any-hit call, so shadow record-all behavior works correctly
1510  unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1511  OptixBuildInput build_input = {};
1512  build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
1513  build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1514  build_input.triangleArray.numVertices = num_verts;
1515  build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
1516  build_input.triangleArray.vertexStrideInBytes = sizeof(float3);
1517  build_input.triangleArray.indexBuffer = index_data.device_pointer;
1518  build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
1519  build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
1520  build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
1521  build_input.triangleArray.flags = &build_flags;
1522  // The SBT does not store per primitive data since Cycles already allocates separate
1523  // buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
1524  // one and rely on that having the same meaning in this case.
1525  build_input.triangleArray.numSbtRecords = 1;
1526  build_input.triangleArray.primitiveIndexOffset = mesh->optix_prim_offset;
1527 
1528  if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1529  progress.set_error("Failed to build OptiX acceleration structure");
1530  }
1531  }
1532  }
1533  else {
1534  unsigned int num_instances = 0;
1535  unsigned int max_num_instances = 0xFFFFFFFF;
1536 
1537  bvh_optix->as_data.free();
1538  bvh_optix->traversable_handle = 0;
1539  bvh_optix->motion_transform_data.free();
1540 
1541  optixDeviceContextGetProperty(context,
1542  OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
1543  &max_num_instances,
1544  sizeof(max_num_instances));
1545  // Do not count first bit, which is used to distinguish instanced and non-instanced objects
1546  max_num_instances >>= 1;
1547  if (bvh->objects.size() > max_num_instances) {
1548  progress.set_error(
1549  "Failed to build OptiX acceleration structure because there are too many instances");
1550  return;
1551  }
1552 
1553  // Fill instance descriptions
1554 # if OPTIX_ABI_VERSION < 41
1555  device_vector<OptixAabb> aabbs(this, "optix tlas aabbs", MEM_READ_ONLY);
1556  aabbs.alloc(bvh->objects.size());
1557 # endif
1558  device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
1559  instances.alloc(bvh->objects.size());
1560 
1561  // Calculate total motion transform size and allocate memory for them
1562  size_t motion_transform_offset = 0;
1563  if (motion_blur) {
1564  size_t total_motion_transform_size = 0;
1565  for (Object *const ob : bvh->objects) {
1566  if (ob->is_traceable() && ob->use_motion()) {
1567  total_motion_transform_size = align_up(total_motion_transform_size,
1568  OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1569  const size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
1570  total_motion_transform_size = total_motion_transform_size +
1571  sizeof(OptixSRTMotionTransform) +
1572  motion_keys * sizeof(OptixSRTData);
1573  }
1574  }
1575 
1576  assert(bvh_optix->motion_transform_data.device == this);
1577  bvh_optix->motion_transform_data.alloc_to_device(total_motion_transform_size);
1578  }
1579 
1580  for (Object *ob : bvh->objects) {
1581  // Skip non-traceable objects
1582  if (!ob->is_traceable())
1583  continue;
1584 
1585  BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
1586  OptixTraversableHandle handle = blas->traversable_handle;
1587 
1588 # if OPTIX_ABI_VERSION < 41
1589  OptixAabb &aabb = aabbs[num_instances];
1590  aabb.minX = ob->bounds.min.x;
1591  aabb.minY = ob->bounds.min.y;
1592  aabb.minZ = ob->bounds.min.z;
1593  aabb.maxX = ob->bounds.max.x;
1594  aabb.maxY = ob->bounds.max.y;
1595  aabb.maxZ = ob->bounds.max.z;
1596 # endif
1597 
1598  OptixInstance &instance = instances[num_instances++];
1599  memset(&instance, 0, sizeof(instance));
1600 
1601  // Clear transform to identity matrix
1602  instance.transform[0] = 1.0f;
1603  instance.transform[5] = 1.0f;
1604  instance.transform[10] = 1.0f;
1605 
1606  // Set user instance ID to object index (but leave low bit blank)
1607  instance.instanceId = ob->get_device_index() << 1;
1608 
1609  // Have to have at least one bit in the mask, or else instance would always be culled
1610  instance.visibilityMask = 1;
1611 
1612  if (ob->get_geometry()->has_volume) {
1613  // Volumes have a special bit set in the visibility mask so a trace can mask only volumes
1614  instance.visibilityMask |= 2;
1615  }
1616 
1617  if (ob->get_geometry()->geometry_type == Geometry::HAIR) {
1618  // Same applies to curves (so they can be skipped in local trace calls)
1619  instance.visibilityMask |= 4;
1620 
1621 # if OPTIX_ABI_VERSION >= 36
1622  if (motion_blur && ob->get_geometry()->has_motion_blur() &&
1623  DebugFlags().optix.curves_api &&
1624  static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK) {
1625  // Select between motion blur and non-motion blur built-in intersection module
1626  instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
1627  }
1628 # endif
1629  }
1630 
1631  // Insert motion traversable if object has motion
1632  if (motion_blur && ob->use_motion()) {
1633  size_t motion_keys = max(ob->get_motion().size(), 2) - 2;
1634  size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
1635  motion_keys * sizeof(OptixSRTData);
1636 
1637  const CUDAContextScope scope(cuContext);
1638 
1639  motion_transform_offset = align_up(motion_transform_offset,
1640  OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1641  CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data.device_pointer +
1642  motion_transform_offset;
1643  motion_transform_offset += motion_transform_size;
1644 
1645  // Allocate host side memory for motion transform and fill it with transform data
1646  OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
1647  new uint8_t[motion_transform_size]);
1648  motion_transform.child = handle;
1649  motion_transform.motionOptions.numKeys = ob->get_motion().size();
1650  motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
1651  motion_transform.motionOptions.timeBegin = 0.0f;
1652  motion_transform.motionOptions.timeEnd = 1.0f;
1653 
1654  OptixSRTData *const srt_data = motion_transform.srtData;
1655  array<DecomposedTransform> decomp(ob->get_motion().size());
1657  decomp.data(), ob->get_motion().data(), ob->get_motion().size());
1658 
1659  for (size_t i = 0; i < ob->get_motion().size(); ++i) {
1660  // Scale
1661  srt_data[i].sx = decomp[i].y.w; // scale.x.x
1662  srt_data[i].sy = decomp[i].z.w; // scale.y.y
1663  srt_data[i].sz = decomp[i].w.w; // scale.z.z
1664 
1665  // Shear
1666  srt_data[i].a = decomp[i].z.x; // scale.x.y
1667  srt_data[i].b = decomp[i].z.y; // scale.x.z
1668  srt_data[i].c = decomp[i].w.x; // scale.y.z
1669  assert(decomp[i].z.z == 0.0f); // scale.y.x
1670  assert(decomp[i].w.y == 0.0f); // scale.z.x
1671  assert(decomp[i].w.z == 0.0f); // scale.z.y
1672 
1673  // Pivot point
1674  srt_data[i].pvx = 0.0f;
1675  srt_data[i].pvy = 0.0f;
1676  srt_data[i].pvz = 0.0f;
1677 
1678  // Rotation
1679  srt_data[i].qx = decomp[i].x.x;
1680  srt_data[i].qy = decomp[i].x.y;
1681  srt_data[i].qz = decomp[i].x.z;
1682  srt_data[i].qw = decomp[i].x.w;
1683 
1684  // Translation
1685  srt_data[i].tx = decomp[i].y.x;
1686  srt_data[i].ty = decomp[i].y.y;
1687  srt_data[i].tz = decomp[i].y.z;
1688  }
1689 
1690  // Upload motion transform to GPU
1691  cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
1692  delete[] reinterpret_cast<uint8_t *>(&motion_transform);
1693 
1694  // Disable instance transform if object uses motion transform already
1695  instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
1696 
1697  // Get traversable handle to motion transform
1698  optixConvertPointerToTraversableHandle(context,
1699  motion_transform_gpu,
1700  OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
1701  &instance.traversableHandle);
1702  }
1703  else {
1704  instance.traversableHandle = handle;
1705 
1706  if (ob->get_geometry()->is_instanced()) {
1707  // Set transform matrix
1708  memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
1709  }
1710  else {
1711  // Disable instance transform if geometry already has it applied to vertex data
1712  instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_TRANSFORM;
1713  // Non-instanced objects read ID from 'prim_object', so distinguish
1714  // them from instanced objects with the low bit set
1715  instance.instanceId |= 1;
1716  }
1717  }
1718  }
1719 
1720  // Upload instance descriptions
1721 # if OPTIX_ABI_VERSION < 41
1722  aabbs.resize(num_instances);
1723  aabbs.copy_to_device();
1724 # endif
1725  instances.resize(num_instances);
1726  instances.copy_to_device();
1727 
1728  // Build top-level acceleration structure (TLAS)
1729  OptixBuildInput build_input = {};
1730  build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
1731 # if OPTIX_ABI_VERSION < 41 // Instance AABBs no longer need to be set since OptiX 7.2
1732  build_input.instanceArray.aabbs = aabbs.device_pointer;
1733  build_input.instanceArray.numAabbs = num_instances;
1734 # endif
1735  build_input.instanceArray.instances = instances.device_pointer;
1736  build_input.instanceArray.numInstances = num_instances;
1737 
1738  if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
1739  progress.set_error("Failed to build OptiX acceleration structure");
1740  }
1741  tlas_handle = bvh_optix->traversable_handle;
1742  }
1743  }
1744 
1745  void release_optix_bvh(BVH *bvh) override
1746  {
1747  thread_scoped_lock lock(delayed_free_bvh_mutex);
1748  /* Do delayed free of BVH memory, since geometry holding BVH might be deleted
1749  * while GPU is still rendering. */
1750  BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1751 
1752  delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
1753  delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
1754  bvh_optix->traversable_handle = 0;
1755  }
1756 
1757  void free_bvh_memory_delayed()
1758  {
1759  thread_scoped_lock lock(delayed_free_bvh_mutex);
1760  delayed_free_bvh_memory.free_memory();
1761  }
1762 
  // Copy a named constant-memory block ('__data' or one of the KERNEL_TEX
  // entries) to the device, updating both the CUDA module's constant memory
  // and the OptiX launch parameters.
  void const_copy_to(const char *name, void *host, size_t size) override
  {
    // Set constant memory for CUDA module
    // TODO(pmours): This is only used for tonemapping (see 'film_convert').
    // Could be removed by moving those functions to filter CUDA module.
    CUDADevice::const_copy_to(name, host, size);

    if (strcmp(name, "__data") == 0) {
      assert(size <= sizeof(KernelData));

      // Update traversable handle (since it is different for each device on multi devices)
      KernelData *const data = (KernelData *)host;
      *(OptixTraversableHandle *)&data->bvh.scene = tlas_handle;

      update_launch_params(offsetof(KernelParams, data), host, size);
      return;
    }

    // Update data storage pointers in launch parameters.
    // The include expands KERNEL_TEX once per texture, generating a
    // string-compare-and-update branch for each one; the first match returns.
# define KERNEL_TEX(data_type, tex_name) \
  if (strcmp(name, #tex_name) == 0) { \
    update_launch_params(offsetof(KernelParams, tex_name), host, size); \
    return; \
  }
# include "kernel/kernel_textures.h"
# undef KERNEL_TEX
  }
1790 
1791  void update_launch_params(size_t offset, void *data, size_t data_size)
1792  {
1793  const CUDAContextScope scope(cuContext);
1794 
1795  for (int i = 0; i < info.cpu_threads; ++i)
1796  check_result_cuda(
1797  cuMemcpyHtoD(launch_params.device_pointer + i * launch_params.data_elements + offset,
1798  data,
1799  data_size));
1800  }
1801 
1802  void task_add(DeviceTask &task) override
1803  {
1804  // Upload texture information to device if it has changed since last launch
1805  load_texture_info();
1806 
1807  if (task.type == DeviceTask::FILM_CONVERT) {
1808  // Execute in main thread because of OpenGL access
1809  film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
1810  return;
1811  }
1812 
1813  if (task.type == DeviceTask::DENOISE_BUFFER) {
1814  // Execute denoising in a single thread (e.g. to avoid race conditions during creation)
1815  task_pool.push([=] {
1816  DeviceTask task_copy = task;
1817  thread_run(task_copy, 0);
1818  });
1819  return;
1820  }
1821 
1822  // Split task into smaller ones
1823  list<DeviceTask> tasks;
1824  task.split(tasks, info.cpu_threads);
1825 
1826  // Queue tasks in internal task pool
1827  int task_index = 0;
1828  for (DeviceTask &task : tasks) {
1829  task_pool.push([=] {
1830  // Using task index parameter instead of thread index, since number of CUDA streams may
1831  // differ from number of threads
1832  DeviceTask task_copy = task;
1833  thread_run(task_copy, task_index);
1834  });
1835  task_index++;
1836  }
1837  }
1838 
1839  void task_wait() override
1840  {
1841  // Wait for all queued tasks to finish
1842  task_pool.wait_work();
1843  }
1844 
1845  void task_cancel() override
1846  {
1847  // Cancel any remaining tasks in the internal pool
1848  task_pool.cancel();
1849  }
1850 };
1851 
1852 bool device_optix_init()
1853 {
1854  if (g_optixFunctionTable.optixDeviceContextCreate != NULL)
1855  return true; // Already initialized function table
1856 
1857  // Need to initialize CUDA as well
1858  if (!device_cuda_init())
1859  return false;
1860 
1861  const OptixResult result = optixInit();
1862 
1863  if (result == OPTIX_ERROR_UNSUPPORTED_ABI_VERSION) {
1864  VLOG(1) << "OptiX initialization failed because the installed NVIDIA driver is too old. "
1865  "Please update to the latest driver first!";
1866  return false;
1867  }
1868  else if (result != OPTIX_SUCCESS) {
1869  VLOG(1) << "OptiX initialization failed with error code " << (unsigned int)result;
1870  return false;
1871  }
1872 
1873  // Loaded OptiX successfully!
1874  return true;
1875 }
1876 
1878 {
1879  devices.reserve(cuda_devices.size());
1880 
1881  // Simply add all supported CUDA devices as OptiX devices again
1882  for (DeviceInfo info : cuda_devices) {
1883  assert(info.type == DEVICE_CUDA);
1884 
1885  int major;
1886  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, info.num);
1887  if (major < 5) {
1888  continue; // Only Maxwell and up are supported by OptiX
1889  }
1890 
1891  info.type = DEVICE_OPTIX;
1892  info.id += "_OptiX";
1893  info.denoisers |= DENOISER_OPTIX;
1894  info.has_branched_path = false;
1895 
1896  devices.push_back(info);
1897  }
1898 }
1899 
1900 Device *device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
1901 {
1902  return new OptiXDevice(info, stats, profiler, background);
1903 }
1904 
1906 
1907 #endif
typedef float(TangentPoint)[2]
unsigned int uint
Definition: BLI_sys_types.h:83
ThreadMutex mutex
void swap(T &a, T &b)
Definition: Common.h:33
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint GLsizei GLsizei GLenum type _GL_VOID_RET _GL_VOID GLsizei GLenum GLenum const void *pixels _GL_VOID_RET _GL_VOID const void *pointer _GL_VOID_RET _GL_VOID GLdouble v _GL_VOID_RET _GL_VOID GLfloat v _GL_VOID_RET _GL_VOID GLint GLint i2 _GL_VOID_RET _GL_VOID GLint j _GL_VOID_RET _GL_VOID GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble GLdouble GLdouble zFar _GL_VOID_RET _GL_UINT GLdouble *equation _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLenum GLfloat *v _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLfloat *values _GL_VOID_RET _GL_VOID GLushort *values _GL_VOID_RET _GL_VOID GLenum GLfloat *params _GL_VOID_RET _GL_VOID GLenum GLdouble *params _GL_VOID_RET _GL_VOID GLenum GLint *params _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_BOOL GLfloat param _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET 
_GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLushort pattern _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLint GLdouble GLdouble GLint GLint const GLdouble *points _GL_VOID_RET _GL_VOID GLdouble GLdouble u2 _GL_VOID_RET _GL_VOID GLdouble GLdouble GLint GLdouble GLdouble v2 _GL_VOID_RET _GL_VOID GLenum GLfloat param _GL_VOID_RET _GL_VOID GLenum GLint param _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLdouble GLdouble nz _GL_VOID_RET _GL_VOID GLfloat GLfloat nz _GL_VOID_RET _GL_VOID GLint GLint nz _GL_VOID_RET _GL_VOID GLshort GLshort nz _GL_VOID_RET _GL_VOID GLsizei const void *pointer _GL_VOID_RET _GL_VOID GLsizei const GLfloat *values _GL_VOID_RET _GL_VOID GLsizei const GLushort *values _GL_VOID_RET _GL_VOID GLint param _GL_VOID_RET _GL_VOID const GLuint const GLclampf *priorities _GL_VOID_RET _GL_VOID GLdouble y _GL_VOID_RET _GL_VOID GLfloat y _GL_VOID_RET _GL_VOID GLint y _GL_VOID_RET _GL_VOID GLshort y _GL_VOID_RET _GL_VOID GLdouble GLdouble z _GL_VOID_RET _GL_VOID GLfloat GLfloat z _GL_VOID_RET _GL_VOID GLint GLint z _GL_VOID_RET _GL_VOID GLshort GLshort z _GL_VOID_RET _GL_VOID GLdouble GLdouble z
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum type
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum const void *lists _GL_VOID_RET _GL_VOID const GLdouble *equation _GL_VOID_RET _GL_VOID GLdouble GLdouble blue _GL_VOID_RET _GL_VOID GLfloat GLfloat blue _GL_VOID_RET _GL_VOID GLint GLint blue _GL_VOID_RET _GL_VOID GLshort GLshort blue _GL_VOID_RET _GL_VOID GLubyte GLubyte blue _GL_VOID_RET _GL_VOID GLuint GLuint blue _GL_VOID_RET _GL_VOID GLushort GLushort blue _GL_VOID_RET _GL_VOID GLbyte GLbyte GLbyte alpha _GL_VOID_RET _GL_VOID GLdouble GLdouble GLdouble alpha _GL_VOID_RET _GL_VOID GLfloat GLfloat GLfloat alpha _GL_VOID_RET _GL_VOID GLint GLint GLint alpha _GL_VOID_RET _GL_VOID GLshort GLshort GLshort alpha _GL_VOID_RET _GL_VOID GLubyte GLubyte GLubyte alpha _GL_VOID_RET _GL_VOID GLuint GLuint GLuint alpha _GL_VOID_RET _GL_VOID GLushort GLushort GLushort alpha _GL_VOID_RET _GL_VOID GLenum mode _GL_VOID_RET _GL_VOID GLint y
Group RGB to Bright Vector Camera Vector Combine Material Light Line Style Layer Add Ambient Diffuse Glossy Refraction Transparent Toon Principled Hair Volume Principled Light Particle Volume Image Sky Noise Wave Voronoi Brick Texture Vector Combine Vertex Separate Vector White RGB Map Separate Set Z Dilate Combine Combine Color Channel Split ID Combine Luminance Directional Alpha Distance Hue Movie Ellipse Bokeh View Corner DENOISE
ATTR_WARN_UNUSED_RESULT const BMVert * v
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition: btDbvt.cpp:52
static btDbvtVolume bounds(btDbvtNode **leaves, int count)
Definition: btDbvt.cpp:299
#define output
void refit(btStridingMeshInterface *triangles, const btVector3 &aabbMin, const btVector3 &aabbMax)
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition: btQuadWord.h:119
int BVHLayoutMask
Definition: bvh_params.h:39
Attribute * find(ustring name) const
Definition: attribute.cpp:447
float3 * data_float3()
Definition: attribute.h:86
BVHLayout bvh_layout
Definition: bvh_params.h:70
bool top_level
Definition: bvh_params.h:67
Definition: bvh/bvh.h:80
vector< Geometry * > geometry
Definition: bvh/bvh.h:83
BVHParams params
Definition: bvh/bvh.h:82
vector< Object * > objects
Definition: bvh/bvh.h:84
OptiX optix
Definition: util_debug.h:183
Definition: device.h:293
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit)
Definition: device.cpp:369
Type geometry_type
Definition: geometry.h:78
@ MESH
Definition: geometry.h:73
@ VOLUME
Definition: geometry.h:75
@ HAIR
Definition: geometry.h:74
size_t optix_prim_offset
Definition: geometry.h:103
AttributeSet attributes
Definition: geometry.h:81
void set_substatus(const string &substatus_)
void set_error(const string &error_message_)
double render_time
Definition: buffers.h:82
static const int SIZE
Definition: buffers.h:173
static const int CENTER
Definition: buffers.h:174
int stride
Definition: buffers.h:143
int sample
Definition: buffers.h:140
RenderBuffers * buffers
Definition: buffers.h:152
int num_samples
Definition: buffers.h:139
@ PATH_TRACE
Definition: buffers.h:134
device_ptr buffer
Definition: buffers.h:146
Task task
Definition: buffers.h:136
int offset
Definition: buffers.h:142
int start_sample
Definition: buffers.h:138
int4 bounds() const
Definition: buffers.h:156
size_t data_height
void * host_pointer
device_ptr device_pointer
Device * device
size_t device_size
void alloc_to_device(size_t num, bool shrink_to_fit=true)
T * alloc(size_t width, size_t height=0, size_t depth=0)
void copy_to_device()
int x
Definition: btConvexHull.h:149
int w
Definition: btConvexHull.h:149
int y
Definition: btConvexHull.h:149
int z
Definition: btConvexHull.h:149
void free_memory()
Definition: util_vector.h:44
CCL_NAMESPACE_BEGIN struct Options options
Curve curve
@ DEVICE_CUDA
Definition: device.h:47
@ DEVICE_OPTIX
Definition: device.h:50
Device * device_optix_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
void device_optix_info(const vector< DeviceInfo > &cuda_devices, vector< DeviceInfo > &devices)
bool device_cuda_init()
bool device_optix_init()
@ MEM_READ_ONLY
Definition: device_memory.h:36
@ DENOISER_OPTIX
Definition: device_task.h:37
TaskPool * task_pool
static float verts[][3]
DO_INLINE void filter(lfVector *V, fmatrix3x3 *S)
uiWidgetBaseParameters params[MAX_WIDGET_BASE_BATCH]
unsigned long long CUtexObject
#define CCL_NAMESPACE_END
#define make_float4(x, y, z, w)
#define make_int2(x, y)
void KERNEL_FUNCTION_FULL_NAME() shader(KernelGlobals *kg, uint4 *input, float4 *output, int type, int filter, int i, int offset, int sample)
@ ATTR_STD_MOTION_VERTEX_POSITION
Definition: kernel_types.h:756
@ CURVE_THICK
Definition: kernel_types.h:715
@ BVH_LAYOUT_OPTIX
@ BVH_LAYOUT_BVH2
@ SHADER_EVAL_DISPLACE
Definition: kernel_types.h:197
@ SHADER_EVAL_BAKE
Definition: kernel_types.h:200
@ DENOISING_PASS_ALBEDO
Definition: kernel_types.h:411
@ DENOISING_PASS_COLOR
Definition: kernel_types.h:417
@ DENOISING_PASS_NORMAL
Definition: kernel_types.h:409
Segment< FEdge *, Vec3r > segment
static void sample(SocketReader *reader, int x, int y, float color[4])
Vector< CPUDevice > devices
list of all CPUDevices. for every hardware thread an instance of CPUDevice is created
struct blender::compositor::@172::@174 task
struct SELECTID_Context context
Definition: select_engine.c:47
#define min(a, b)
Definition: sort.c:51
unsigned short uint16_t
Definition: stdint.h:82
unsigned char uint8_t
Definition: stdint.h:81
unsigned __int64 uint64_t
Definition: stdint.h:93
float3 max
Definition: util_boundbox.h:34
float3 min
Definition: util_boundbox.h:34
Curve get_curve(size_t i) const
Definition: hair.h:119
size_t num_curves() const
Definition: hair.h:133
size_t num_segments() const
Definition: hair.h:138
CurveShapeType curve_shape
Definition: hair.h:99
float size[3]
size_t num_triangles() const
Definition: mesh.h:92
bool use_motion() const
Definition: object.cpp:238
NODE_DECLARE BoundBox bounds
Definition: object.h:56
int get_device_index() const
Definition: object.cpp:369
bool is_traceable() const
Definition: object.cpp:261
void * data
void push(TaskRunFunction &&task)
Definition: util_task.cpp:36
void cancel()
Definition: util_task.cpp:54
void wait_work(Summary *stats=NULL)
Definition: util_task.cpp:42
int offsets[9]
int y[4]
int strides[9]
int x[4]
long long int buffers[9]
uint start_sample
uint num_samples
ccl_global float * buffer
float z
Definition: sky_float3.h:35
float y
Definition: sky_float3.h:35
float x
Definition: sky_float3.h:35
float max
DebugFlags & DebugFlags()
Definition: util_debug.h:205
#define VLOG(severity)
Definition: util_logging.h:50
ccl_device_inline float dot(const float2 &a, const float2 &b)
size_t path_file_size(const string &path)
Definition: util_path.cpp:563
string path_get(const string &sub)
Definition: util_path.cpp:351
bool path_read_text(const string &path, string &text)
Definition: util_path.cpp:714
ccl_device_inline int4 rect_clip(int4 a, int4 b)
Definition: util_rect.h:38
ccl_device_inline int4 rect_expand(int4 rect, int d)
Definition: util_rect.h:32
ccl_device_inline int rect_size(int4 rect)
Definition: util_rect.h:65
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
Definition: util_string.cpp:32
std::unique_lock< std::mutex > thread_scoped_lock
Definition: util_thread.h:41
CCL_NAMESPACE_BEGIN typedef std::mutex thread_mutex
Definition: util_thread.h:40
void transform_motion_decompose(DecomposedTransform *decomp, const Transform *motion, size_t size)
ccl_device_inline size_t align_up(size_t offset, size_t alignment)
Definition: util_types.h:65
uint64_t device_ptr
Definition: util_types.h:62