/* Source listing: Blender 4.3, intern/cycles/device/optix/device_impl.cpp.
 * Captured from the generated documentation page; embedded line numbers and a
 * few dropped lines are artifacts of that extraction. */
1/* SPDX-FileCopyrightText: 2019 NVIDIA Corporation
2 * SPDX-FileCopyrightText: 2019-2022 Blender Foundation
3 *
4 * SPDX-License-Identifier: Apache-2.0 */
5
6#ifdef WITH_OPTIX
7
# include "device/optix/device_impl.h"
# include "device/optix/queue.h"

# include "bvh/bvh.h"
# include "bvh/optix.h"

# include "scene/hair.h"
# include "scene/mesh.h"
# include "scene/object.h"
# include "scene/pass.h"
# include "scene/pointcloud.h"
# include "scene/scene.h"

# include "util/debug.h"
# include "util/log.h"
# include "util/md5.h"
# include "util/path.h"
# include "util/progress.h"
# include "util/task.h"
# include "util/time.h"
28
29# define __KERNEL_OPTIX__
31
33
# if OPTIX_ABI_VERSION >= 55
/* Execute one OptiX parallel-compilation task and schedule any additional tasks it spawns onto
 * the given task pool, recursively. On failure the OptiX error code is stored in
 * `failure_reason` so the caller can report it once the pool has finished all work. */
static void execute_optix_task(TaskPool &pool, OptixTask task, OptixResult &failure_reason)
{
  /* optixTaskExecute may split the work into up to this many follow-up tasks per call. */
  OptixTask additional_tasks[16];
  unsigned int num_additional_tasks = 0;

  const OptixResult result = optixTaskExecute(task, additional_tasks, 16, &num_additional_tasks);
  if (result == OPTIX_SUCCESS) {
    for (unsigned int i = 0; i < num_additional_tasks; ++i) {
      /* NOTE(review): this `pool.push` line was dropped from the extracted listing and has been
       * reconstructed from upstream Blender sources — verify against the repository. */
      pool.push(function_bind(
          &execute_optix_task, std::ref(pool), additional_tasks[i], std::ref(failure_reason)));
    }
  }
  else {
    failure_reason = result;
  }
}
# endif
52
/* Construct an OptiX device on top of the underlying CUDA device: creates the OptiX device
 * context used by all later module/pipeline/BVH operations and allocates the launch-parameter
 * buffer. On CUDA initialization failure the constructor returns early and leaves the device
 * unusable (callers check for errors separately). */
OptiXDevice::OptiXDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler, bool headless)
    : CUDADevice(info, stats, profiler, headless),
      sbt_data(this, "__sbt", MEM_READ_ONLY),
      launch_params(this, "kernel_params", false)
{
  /* Make the CUDA context current. */
  if (!cuContext) {
    /* Do not initialize if CUDA context creation failed already. */
    return;
  }
  const CUDAContextScope scope(this);

  /* Create OptiX context for this device. */
  OptixDeviceContextOptions options = {};
# ifdef WITH_CYCLES_LOGGING
  options.logCallbackLevel = 4; /* Fatal = 1, Error = 2, Warning = 3, Print = 4. */
  /* Forward OptiX log messages to the Cycles logging system, mapping OptiX severity levels to
   * the corresponding log severities. Only emitted when verbose logging is enabled. */
  options.logCallbackFunction = [](unsigned int level, const char *, const char *message, void *) {
    switch (level) {
      case 1:
        LOG_IF(FATAL, VLOG_IS_ON(1)) << message;
        break;
      case 2:
        LOG_IF(ERROR, VLOG_IS_ON(1)) << message;
        break;
      case 3:
        LOG_IF(WARNING, VLOG_IS_ON(1)) << message;
        break;
      case 4:
        LOG_IF(INFO, VLOG_IS_ON(1)) << message;
        break;
    }
  };
# endif
  if (DebugFlags().optix.use_debug) {
    VLOG_INFO << "Using OptiX debug mode.";
    options.validationMode = OPTIX_DEVICE_CONTEXT_VALIDATION_MODE_ALL;
  }
  optix_assert(optixDeviceContextCreate(cuContext, &options, &context));
# ifdef WITH_CYCLES_LOGGING
  /* Register the callback with the created context; passing the options again keeps the
   * callback configuration in one place. */
  optix_assert(optixDeviceContextSetLogCallback(
      context, options.logCallbackFunction, options.logCallbackData, options.logCallbackLevel));
# endif

  /* Fix weird compiler bug that assigns wrong size. */
  launch_params.data_elements = sizeof(KernelParamsOptiX);

  /* Allocate launch parameter buffer memory on device. */
  launch_params.alloc_to_device(1);
}
102
/* Tear down all OptiX state. Order matters: device memory and program groups / pipelines /
 * modules are released first, and the OptiX device context is destroyed last since everything
 * else was created from it. */
OptiXDevice::~OptiXDevice()
{
  /* Make CUDA context current. */
  const CUDAContextScope scope(this);

  /* Release BVH buffers whose freeing was deferred until the device was idle. */
  free_bvh_memory_delayed();

  sbt_data.free();
  texture_info.free();
  launch_params.free();

  /* Unload modules. */
  if (optix_module != NULL) {
    optixModuleDestroy(optix_module);
  }
  /* Two built-in intersection modules: without and with motion blur (see load_kernels). */
  for (int i = 0; i < 2; ++i) {
    if (builtin_modules[i] != NULL) {
      optixModuleDestroy(builtin_modules[i]);
    }
  }
  for (int i = 0; i < NUM_PIPELINES; ++i) {
    if (pipelines[i] != NULL) {
      optixPipelineDestroy(pipelines[i]);
    }
  }
  for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
    if (groups[i] != NULL) {
      optixProgramGroupDestroy(groups[i]);
    }
  }

# ifdef WITH_OSL
  /* OSL shader modules/groups are created per shader group and tracked separately. */
  for (const OptixModule &module : osl_modules) {
    if (module != NULL) {
      optixModuleDestroy(module);
    }
  }
  for (const OptixProgramGroup &group : osl_groups) {
    if (group != NULL) {
      optixProgramGroupDestroy(group);
    }
  }
# endif

  /* Destroy the context last, after everything created from it is gone. */
  optixDeviceContextDestroy(context);
}
149
150unique_ptr<DeviceQueue> OptiXDevice::gpu_queue_create()
151{
152 return make_unique<OptiXDeviceQueue>(this);
153}
154
155BVHLayoutMask OptiXDevice::get_bvh_layout_mask(uint /*kernel_features*/) const
156{
157 /* OptiX has its own internal acceleration structure format. */
158 return BVH_LAYOUT_OPTIX;
159}
160
161static string get_optix_include_dir()
162{
163 const char *env_dir = getenv("OPTIX_ROOT_DIR");
164 const char *default_dir = CYCLES_RUNTIME_OPTIX_ROOT_DIR;
165
166 if (env_dir && env_dir[0]) {
167 const string env_include_dir = path_join(env_dir, "include");
168 return env_include_dir;
169 }
170 else if (default_dir[0]) {
171 const string default_include_dir = path_join(default_dir, "include");
172 return default_include_dir;
173 }
174
175 return string();
176}
177
178string OptiXDevice::compile_kernel_get_common_cflags(const uint kernel_features)
179{
180 string common_cflags = CUDADevice::compile_kernel_get_common_cflags(kernel_features);
181
182 /* Add OptiX SDK include directory to include paths. */
183 common_cflags += string_printf(" -I\"%s\"", get_optix_include_dir().c_str());
184
185 /* Specialization for shader ray-tracing. */
186 if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
187 common_cflags += " --keep-device-functions";
188 }
189
190 return common_cflags;
191}
192
193bool OptiXDevice::load_kernels(const uint kernel_features)
194{
195 if (have_error()) {
196 /* Abort early if context creation failed already. */
197 return false;
198 }
199
200# ifdef WITH_OSL
201 const bool use_osl = (kernel_features & KERNEL_FEATURE_OSL);
202# else
203 const bool use_osl = false;
204# endif
205
206 /* Skip creating OptiX module if only doing denoising. */
207 const bool need_optix_kernels = (kernel_features &
209
210 /* Detect existence of OptiX kernel and SDK here early. So we can error out
211 * before compiling the CUDA kernels, to avoid failing right after when
212 * compiling the OptiX kernel. */
213 string suffix = use_osl ? "_osl" :
214 (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) ?
215 "_shader_raytrace" :
216 "";
217 string ptx_filename;
218 if (need_optix_kernels) {
219 ptx_filename = path_get("lib/kernel_optix" + suffix + ".ptx.zst");
220 if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
221 std::string optix_include_dir = get_optix_include_dir();
222 if (optix_include_dir.empty()) {
223 set_error(
224 "Unable to compile OptiX kernels at runtime. Set OPTIX_ROOT_DIR environment variable "
225 "to a directory containing the OptiX SDK.");
226 return false;
227 }
228 else if (!path_is_directory(optix_include_dir)) {
229 set_error(string_printf(
230 "OptiX headers not found at %s, unable to compile OptiX kernels at runtime. Install "
231 "OptiX SDK in the specified location, or set OPTIX_ROOT_DIR environment variable to a "
232 "directory containing the OptiX SDK.",
233 optix_include_dir.c_str()));
234 return false;
235 }
236 }
237 }
238
239 /* Load CUDA modules because we need some of the utility kernels. */
240 if (!CUDADevice::load_kernels(kernel_features)) {
241 return false;
242 }
243
244 if (!need_optix_kernels) {
245 return true;
246 }
247
248 const CUDAContextScope scope(this);
249
250 /* Unload existing OptiX module and pipelines first. */
251 if (optix_module != NULL) {
252 optixModuleDestroy(optix_module);
253 optix_module = NULL;
254 }
255 for (int i = 0; i < 2; ++i) {
256 if (builtin_modules[i] != NULL) {
257 optixModuleDestroy(builtin_modules[i]);
258 builtin_modules[i] = NULL;
259 }
260 }
261 for (int i = 0; i < NUM_PIPELINES; ++i) {
262 if (pipelines[i] != NULL) {
263 optixPipelineDestroy(pipelines[i]);
264 pipelines[i] = NULL;
265 }
266 }
267 for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
268 if (groups[i] != NULL) {
269 optixProgramGroupDestroy(groups[i]);
270 groups[i] = NULL;
271 }
272 }
273
274# ifdef WITH_OSL
275 /* Recreating base OptiX module invalidates all OSL modules too, since they link against it. */
276 for (const OptixModule &module : osl_modules) {
277 if (module != NULL) {
278 optixModuleDestroy(module);
279 }
280 }
281 osl_modules.clear();
282
283 for (const OptixProgramGroup &group : osl_groups) {
284 if (group != NULL) {
285 optixProgramGroupDestroy(group);
286 }
287 }
288 osl_groups.clear();
289# endif
290
291 OptixModuleCompileOptions module_options = {};
292 module_options.maxRegisterCount = 0; /* Do not set an explicit register limit. */
293
294 if (DebugFlags().optix.use_debug) {
295 module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_0;
296 module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_FULL;
297 }
298 else {
299 module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
300 module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
301 }
302
303 module_options.boundValues = nullptr;
304 module_options.numBoundValues = 0;
305# if OPTIX_ABI_VERSION >= 55
306 module_options.payloadTypes = nullptr;
307 module_options.numPayloadTypes = 0;
308# endif
309
310 /* Default to no motion blur and two-level graph, since it is the fastest option. */
311 pipeline_options.usesMotionBlur = false;
312 pipeline_options.traversableGraphFlags =
313 OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_SINGLE_LEVEL_INSTANCING;
314 pipeline_options.numPayloadValues = 8;
315 pipeline_options.numAttributeValues = 2; /* u, v */
316 pipeline_options.exceptionFlags = OPTIX_EXCEPTION_FLAG_NONE;
317 pipeline_options.pipelineLaunchParamsVariableName = "kernel_params"; /* See globals.h */
318
319 pipeline_options.usesPrimitiveTypeFlags = OPTIX_PRIMITIVE_TYPE_FLAGS_TRIANGLE;
320 if (kernel_features & KERNEL_FEATURE_HAIR) {
321 if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
322# if OPTIX_ABI_VERSION >= 55
323 pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CATMULLROM;
324# else
325 pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_ROUND_CUBIC_BSPLINE;
326# endif
327 }
328 else
329 pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
330 }
331 if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
332 pipeline_options.usesPrimitiveTypeFlags |= OPTIX_PRIMITIVE_TYPE_FLAGS_CUSTOM;
333 }
334
335 /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
336 * This is necessary since objects may be reported to have motion if the Vector pass is
337 * active, but may still need to be rendered without motion blur if that isn't active as well. */
338 if (kernel_features & KERNEL_FEATURE_OBJECT_MOTION) {
339 pipeline_options.usesMotionBlur = true;
340 /* Motion blur can insert motion transforms into the traversal graph.
341 * It is no longer a two-level graph then, so need to set flags to allow any configuration. */
342 pipeline_options.traversableGraphFlags = OPTIX_TRAVERSABLE_GRAPH_FLAG_ALLOW_ANY;
343 }
344
345 { /* Load and compile PTX module with OptiX kernels. */
346 string ptx_data;
347 if (use_adaptive_compilation() || path_file_size(ptx_filename) == -1) {
348 string cflags = compile_kernel_get_common_cflags(kernel_features);
349 ptx_filename = compile_kernel(cflags, ("kernel" + suffix).c_str(), "optix", true);
350 }
351 if (ptx_filename.empty() || !path_read_compressed_text(ptx_filename, ptx_data)) {
352 set_error(string_printf("Failed to load OptiX kernel from '%s'", ptx_filename.c_str()));
353 return false;
354 }
355
356# if OPTIX_ABI_VERSION >= 84
357 OptixTask task = nullptr;
358 OptixResult result = optixModuleCreateWithTasks(context,
359 &module_options,
360 &pipeline_options,
361 ptx_data.data(),
362 ptx_data.size(),
363 nullptr,
364 nullptr,
365 &optix_module,
366 &task);
367 if (result == OPTIX_SUCCESS) {
369 execute_optix_task(pool, task, result);
370 pool.wait_work();
371 }
372# elif OPTIX_ABI_VERSION >= 55
373 OptixTask task = nullptr;
374 OptixResult result = optixModuleCreateFromPTXWithTasks(context,
375 &module_options,
376 &pipeline_options,
377 ptx_data.data(),
378 ptx_data.size(),
379 nullptr,
380 nullptr,
381 &optix_module,
382 &task);
383 if (result == OPTIX_SUCCESS) {
385 execute_optix_task(pool, task, result);
386 pool.wait_work();
387 }
388# else
389 const OptixResult result = optixModuleCreateFromPTX(context,
390 &module_options,
391 &pipeline_options,
392 ptx_data.data(),
393 ptx_data.size(),
394 nullptr,
395 0,
396 &optix_module);
397# endif
398 if (result != OPTIX_SUCCESS) {
399 set_error(string_printf("Failed to load OptiX kernel from '%s' (%s)",
400 ptx_filename.c_str(),
401 optixGetErrorName(result)));
402 return false;
403 }
404 }
405
406 /* Create program groups. */
407 OptixProgramGroupDesc group_descs[NUM_PROGRAM_GROUPS] = {};
408 OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
409 group_descs[PG_RGEN_INTERSECT_CLOSEST].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
410 group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.module = optix_module;
411 group_descs[PG_RGEN_INTERSECT_CLOSEST].raygen.entryFunctionName =
412 "__raygen__kernel_optix_integrator_intersect_closest";
413 group_descs[PG_RGEN_INTERSECT_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
414 group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.module = optix_module;
415 group_descs[PG_RGEN_INTERSECT_SHADOW].raygen.entryFunctionName =
416 "__raygen__kernel_optix_integrator_intersect_shadow";
417 group_descs[PG_RGEN_INTERSECT_SUBSURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
418 group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.module = optix_module;
419 group_descs[PG_RGEN_INTERSECT_SUBSURFACE].raygen.entryFunctionName =
420 "__raygen__kernel_optix_integrator_intersect_subsurface";
421 group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
422 group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.module = optix_module;
423 group_descs[PG_RGEN_INTERSECT_VOLUME_STACK].raygen.entryFunctionName =
424 "__raygen__kernel_optix_integrator_intersect_volume_stack";
425 group_descs[PG_RGEN_INTERSECT_DEDICATED_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
426 group_descs[PG_RGEN_INTERSECT_DEDICATED_LIGHT].raygen.module = optix_module;
427 group_descs[PG_RGEN_INTERSECT_DEDICATED_LIGHT].raygen.entryFunctionName =
428 "__raygen__kernel_optix_integrator_intersect_dedicated_light";
429 group_descs[PG_MISS].kind = OPTIX_PROGRAM_GROUP_KIND_MISS;
430 group_descs[PG_MISS].miss.module = optix_module;
431 group_descs[PG_MISS].miss.entryFunctionName = "__miss__kernel_optix_miss";
432 group_descs[PG_HITD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
433 group_descs[PG_HITD].hitgroup.moduleCH = optix_module;
434 group_descs[PG_HITD].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
435 group_descs[PG_HITD].hitgroup.moduleAH = optix_module;
436 group_descs[PG_HITD].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_visibility_test";
437 group_descs[PG_HITS].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
438 group_descs[PG_HITS].hitgroup.moduleAH = optix_module;
439 group_descs[PG_HITS].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_shadow_all_hit";
440 group_descs[PG_HITV].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
441 group_descs[PG_HITV].hitgroup.moduleCH = optix_module;
442 group_descs[PG_HITV].hitgroup.entryFunctionNameCH = "__closesthit__kernel_optix_hit";
443 group_descs[PG_HITV].hitgroup.moduleAH = optix_module;
444 group_descs[PG_HITV].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_volume_test";
445
446 if (kernel_features & KERNEL_FEATURE_HAIR) {
447 if (kernel_features & KERNEL_FEATURE_HAIR_THICK) {
448 /* Built-in thick curve intersection. */
449 OptixBuiltinISOptions builtin_options = {};
450# if OPTIX_ABI_VERSION >= 55
451 builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
452 builtin_options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE |
453 OPTIX_BUILD_FLAG_ALLOW_COMPACTION |
454 OPTIX_BUILD_FLAG_ALLOW_UPDATE;
455 builtin_options.curveEndcapFlags = OPTIX_CURVE_ENDCAP_DEFAULT; /* Disable end-caps. */
456# else
457 builtin_options.builtinISModuleType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
458# endif
459 builtin_options.usesMotionBlur = false;
460
461 optix_assert(optixBuiltinISModuleGet(
462 context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[0]));
463
464 group_descs[PG_HITD].hitgroup.moduleIS = builtin_modules[0];
465 group_descs[PG_HITD].hitgroup.entryFunctionNameIS = nullptr;
466 group_descs[PG_HITS].hitgroup.moduleIS = builtin_modules[0];
467 group_descs[PG_HITS].hitgroup.entryFunctionNameIS = nullptr;
468
469 if (pipeline_options.usesMotionBlur) {
470 builtin_options.usesMotionBlur = true;
471
472 optix_assert(optixBuiltinISModuleGet(
473 context, &module_options, &pipeline_options, &builtin_options, &builtin_modules[1]));
474
475 group_descs[PG_HITD_MOTION] = group_descs[PG_HITD];
476 group_descs[PG_HITD_MOTION].hitgroup.moduleIS = builtin_modules[1];
477 group_descs[PG_HITS_MOTION] = group_descs[PG_HITS];
478 group_descs[PG_HITS_MOTION].hitgroup.moduleIS = builtin_modules[1];
479 }
480 }
481 else {
482 /* Custom ribbon intersection. */
483 group_descs[PG_HITD].hitgroup.moduleIS = optix_module;
484 group_descs[PG_HITS].hitgroup.moduleIS = optix_module;
485 group_descs[PG_HITD].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
486 group_descs[PG_HITS].hitgroup.entryFunctionNameIS = "__intersection__curve_ribbon";
487 }
488 }
489
490 if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
491 group_descs[PG_HITD_POINTCLOUD] = group_descs[PG_HITD];
492 group_descs[PG_HITD_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
493 group_descs[PG_HITD_POINTCLOUD].hitgroup.moduleIS = optix_module;
494 group_descs[PG_HITD_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
495 group_descs[PG_HITS_POINTCLOUD] = group_descs[PG_HITS];
496 group_descs[PG_HITS_POINTCLOUD].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
497 group_descs[PG_HITS_POINTCLOUD].hitgroup.moduleIS = optix_module;
498 group_descs[PG_HITS_POINTCLOUD].hitgroup.entryFunctionNameIS = "__intersection__point";
499 }
500
501 /* Add hit group for local intersections. */
503 group_descs[PG_HITL].kind = OPTIX_PROGRAM_GROUP_KIND_HITGROUP;
504 group_descs[PG_HITL].hitgroup.moduleAH = optix_module;
505 group_descs[PG_HITL].hitgroup.entryFunctionNameAH = "__anyhit__kernel_optix_local_hit";
506 }
507
508 /* Shader ray-tracing replaces some functions with direct callables. */
509 if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
510 group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
511 group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.module = optix_module;
512 group_descs[PG_RGEN_SHADE_SURFACE_RAYTRACE].raygen.entryFunctionName =
513 "__raygen__kernel_optix_integrator_shade_surface_raytrace";
514
515 /* Kernels with OSL support are built without SVM, so can skip those direct callables there. */
516 if (!use_osl) {
517 group_descs[PG_CALL_SVM_AO].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
518 group_descs[PG_CALL_SVM_AO].callables.moduleDC = optix_module;
519 group_descs[PG_CALL_SVM_AO].callables.entryFunctionNameDC = "__direct_callable__svm_node_ao";
520 group_descs[PG_CALL_SVM_BEVEL].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
521 group_descs[PG_CALL_SVM_BEVEL].callables.moduleDC = optix_module;
522 group_descs[PG_CALL_SVM_BEVEL].callables.entryFunctionNameDC =
523 "__direct_callable__svm_node_bevel";
524 }
525 }
526
527 if (kernel_features & KERNEL_FEATURE_MNEE) {
528 group_descs[PG_RGEN_SHADE_SURFACE_MNEE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
529 group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.module = optix_module;
530 group_descs[PG_RGEN_SHADE_SURFACE_MNEE].raygen.entryFunctionName =
531 "__raygen__kernel_optix_integrator_shade_surface_mnee";
532 }
533
534 /* OSL uses direct callables to execute, so shading needs to be done in OptiX if OSL is used. */
535 if (use_osl) {
536 group_descs[PG_RGEN_SHADE_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
537 group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.module = optix_module;
538 group_descs[PG_RGEN_SHADE_BACKGROUND].raygen.entryFunctionName =
539 "__raygen__kernel_optix_integrator_shade_background";
540 group_descs[PG_RGEN_SHADE_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
541 group_descs[PG_RGEN_SHADE_LIGHT].raygen.module = optix_module;
542 group_descs[PG_RGEN_SHADE_LIGHT].raygen.entryFunctionName =
543 "__raygen__kernel_optix_integrator_shade_light";
544 group_descs[PG_RGEN_SHADE_SURFACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
545 group_descs[PG_RGEN_SHADE_SURFACE].raygen.module = optix_module;
546 group_descs[PG_RGEN_SHADE_SURFACE].raygen.entryFunctionName =
547 "__raygen__kernel_optix_integrator_shade_surface";
548 group_descs[PG_RGEN_SHADE_VOLUME].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
549 group_descs[PG_RGEN_SHADE_VOLUME].raygen.module = optix_module;
550 group_descs[PG_RGEN_SHADE_VOLUME].raygen.entryFunctionName =
551 "__raygen__kernel_optix_integrator_shade_volume";
552 group_descs[PG_RGEN_SHADE_SHADOW].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
553 group_descs[PG_RGEN_SHADE_SHADOW].raygen.module = optix_module;
554 group_descs[PG_RGEN_SHADE_SHADOW].raygen.entryFunctionName =
555 "__raygen__kernel_optix_integrator_shade_shadow";
556 group_descs[PG_RGEN_SHADE_DEDICATED_LIGHT].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
557 group_descs[PG_RGEN_SHADE_DEDICATED_LIGHT].raygen.module = optix_module;
558 group_descs[PG_RGEN_SHADE_DEDICATED_LIGHT].raygen.entryFunctionName =
559 "__raygen__kernel_optix_integrator_shade_dedicated_light";
560 group_descs[PG_RGEN_EVAL_DISPLACE].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
561 group_descs[PG_RGEN_EVAL_DISPLACE].raygen.module = optix_module;
562 group_descs[PG_RGEN_EVAL_DISPLACE].raygen.entryFunctionName =
563 "__raygen__kernel_optix_shader_eval_displace";
564 group_descs[PG_RGEN_EVAL_BACKGROUND].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
565 group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.module = optix_module;
566 group_descs[PG_RGEN_EVAL_BACKGROUND].raygen.entryFunctionName =
567 "__raygen__kernel_optix_shader_eval_background";
568 group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].kind = OPTIX_PROGRAM_GROUP_KIND_RAYGEN;
569 group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.module = optix_module;
570 group_descs[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY].raygen.entryFunctionName =
571 "__raygen__kernel_optix_shader_eval_curve_shadow_transparency";
572 }
573
574 optix_assert(optixProgramGroupCreate(
575 context, group_descs, NUM_PROGRAM_GROUPS, &group_options, nullptr, 0, groups));
576
577 /* Get program stack sizes. */
578 OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
579 /* Set up SBT, which in this case is used only to select between different programs. */
580 sbt_data.alloc(NUM_PROGRAM_GROUPS);
581 memset(sbt_data.host_pointer, 0, sizeof(SbtRecord) * NUM_PROGRAM_GROUPS);
582 for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
583 optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
584# if OPTIX_ABI_VERSION >= 84
585 optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i], nullptr));
586# else
587 optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
588# endif
589 }
590 sbt_data.copy_to_device(); /* Upload SBT to device. */
591
592 /* Calculate maximum trace continuation stack size. */
593 unsigned int trace_css = stack_size[PG_HITD].cssCH;
594 /* This is based on the maximum of closest-hit and any-hit/intersection programs. */
595 trace_css = std::max(trace_css, stack_size[PG_HITD].cssIS + stack_size[PG_HITD].cssAH);
596 trace_css = std::max(trace_css, stack_size[PG_HITS].cssIS + stack_size[PG_HITS].cssAH);
597 trace_css = std::max(trace_css, stack_size[PG_HITL].cssIS + stack_size[PG_HITL].cssAH);
598 trace_css = std::max(trace_css, stack_size[PG_HITV].cssIS + stack_size[PG_HITV].cssAH);
599 trace_css = std::max(trace_css,
600 stack_size[PG_HITD_MOTION].cssIS + stack_size[PG_HITD_MOTION].cssAH);
601 trace_css = std::max(trace_css,
602 stack_size[PG_HITS_MOTION].cssIS + stack_size[PG_HITS_MOTION].cssAH);
603 trace_css = std::max(
604 trace_css, stack_size[PG_HITD_POINTCLOUD].cssIS + stack_size[PG_HITD_POINTCLOUD].cssAH);
605 trace_css = std::max(
606 trace_css, stack_size[PG_HITS_POINTCLOUD].cssIS + stack_size[PG_HITS_POINTCLOUD].cssAH);
607
608 OptixPipelineLinkOptions link_options = {};
609 link_options.maxTraceDepth = 1;
610# if OPTIX_ABI_VERSION < 84
611 link_options.debugLevel = module_options.debugLevel;
612# endif
613
614 if (use_osl) {
615 /* Re-create OSL pipeline in case kernels are reloaded after it has been created before. */
616 load_osl_kernels();
617 }
618 else if (kernel_features & (KERNEL_FEATURE_NODE_RAYTRACE | KERNEL_FEATURE_MNEE)) {
619 /* Create shader ray-tracing and MNEE pipeline. */
620 vector<OptixProgramGroup> pipeline_groups;
621 pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
622 if (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) {
623 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
624 pipeline_groups.push_back(groups[PG_CALL_SVM_AO]);
625 pipeline_groups.push_back(groups[PG_CALL_SVM_BEVEL]);
626 }
627 if (kernel_features & KERNEL_FEATURE_MNEE) {
628 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
629 }
630 pipeline_groups.push_back(groups[PG_MISS]);
631 pipeline_groups.push_back(groups[PG_HITD]);
632 pipeline_groups.push_back(groups[PG_HITS]);
633 pipeline_groups.push_back(groups[PG_HITL]);
634 pipeline_groups.push_back(groups[PG_HITV]);
635 if (pipeline_options.usesMotionBlur) {
636 pipeline_groups.push_back(groups[PG_HITD_MOTION]);
637 pipeline_groups.push_back(groups[PG_HITS_MOTION]);
638 }
639 if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
640 pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
641 pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
642 }
643
644 optix_assert(optixPipelineCreate(context,
645 &pipeline_options,
646 &link_options,
647 pipeline_groups.data(),
648 pipeline_groups.size(),
649 nullptr,
650 0,
651 &pipelines[PIP_SHADE]));
652
653 /* Combine ray generation and trace continuation stack size. */
654 const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
655 stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG) +
656 link_options.maxTraceDepth * trace_css;
657 const unsigned int dss = std::max(stack_size[PG_CALL_SVM_AO].dssDC,
658 stack_size[PG_CALL_SVM_BEVEL].dssDC);
659
660 /* Set stack size depending on pipeline options. */
661 optix_assert(optixPipelineSetStackSize(
662 pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
663 }
664
665 { /* Create intersection-only pipeline. */
666 vector<OptixProgramGroup> pipeline_groups;
667 pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
668 pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_CLOSEST]);
669 pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SHADOW]);
670 pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_SUBSURFACE]);
671 pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_VOLUME_STACK]);
672 pipeline_groups.push_back(groups[PG_RGEN_INTERSECT_DEDICATED_LIGHT]);
673 pipeline_groups.push_back(groups[PG_MISS]);
674 pipeline_groups.push_back(groups[PG_HITD]);
675 pipeline_groups.push_back(groups[PG_HITS]);
676 pipeline_groups.push_back(groups[PG_HITL]);
677 pipeline_groups.push_back(groups[PG_HITV]);
678 if (pipeline_options.usesMotionBlur) {
679 pipeline_groups.push_back(groups[PG_HITD_MOTION]);
680 pipeline_groups.push_back(groups[PG_HITS_MOTION]);
681 }
682 if (kernel_features & KERNEL_FEATURE_POINTCLOUD) {
683 pipeline_groups.push_back(groups[PG_HITD_POINTCLOUD]);
684 pipeline_groups.push_back(groups[PG_HITS_POINTCLOUD]);
685 }
686
687 optix_assert(optixPipelineCreate(context,
688 &pipeline_options,
689 &link_options,
690 pipeline_groups.data(),
691 pipeline_groups.size(),
692 nullptr,
693 0,
694 &pipelines[PIP_INTERSECT]));
695
696 /* Calculate continuation stack size based on the maximum of all ray generation stack sizes. */
697 const unsigned int css =
698 std::max(stack_size[PG_RGEN_INTERSECT_CLOSEST].cssRG,
699 std::max(stack_size[PG_RGEN_INTERSECT_SHADOW].cssRG,
700 std::max(stack_size[PG_RGEN_INTERSECT_SUBSURFACE].cssRG,
701 stack_size[PG_RGEN_INTERSECT_VOLUME_STACK].cssRG))) +
702 link_options.maxTraceDepth * trace_css;
703
704 optix_assert(optixPipelineSetStackSize(
705 pipelines[PIP_INTERSECT], 0, 0, css, pipeline_options.usesMotionBlur ? 3 : 2));
706 }
707
708 return !have_error();
709}
710
711bool OptiXDevice::load_osl_kernels()
712{
713# ifdef WITH_OSL
714 if (have_error()) {
715 return false;
716 }
717
718 struct OSLKernel {
719 string ptx;
720 string init_entry;
721 string exec_entry;
722 };
723
724 /* This has to be in the same order as the ShaderType enum, so that the index calculation in
725 * osl_eval_nodes checks out */
726 vector<OSLKernel> osl_kernels;
727
729 type = static_cast<ShaderType>(type + 1))
730 {
731 const vector<OSL::ShaderGroupRef> &groups = (type == SHADER_TYPE_SURFACE ?
732 osl_globals.surface_state :
733 type == SHADER_TYPE_VOLUME ?
734 osl_globals.volume_state :
736 osl_globals.displacement_state :
737 osl_globals.bump_state);
738 for (const OSL::ShaderGroupRef &group : groups) {
739 if (group) {
740 string osl_ptx, init_name, entry_name;
741 osl_globals.ss->getattribute(group.get(), "group_init_name", init_name);
742 osl_globals.ss->getattribute(group.get(), "group_entry_name", entry_name);
743 osl_globals.ss->getattribute(
744 group.get(), "ptx_compiled_version", OSL::TypeDesc::PTR, &osl_ptx);
745
746 int groupdata_size = 0;
747 osl_globals.ss->getattribute(group.get(), "llvm_groupdata_size", groupdata_size);
748 if (groupdata_size == 0) {
749 // Old attribute name from our patched OSL version as fallback.
750 osl_globals.ss->getattribute(group.get(), "groupdata_size", groupdata_size);
751 }
752 if (groupdata_size > 2048) { /* See 'group_data' array in kernel/osl/osl.h */
753 set_error(
754 string_printf("Requested OSL group data size (%d) is greater than the maximum "
755 "supported with OptiX (2048)",
756 groupdata_size));
757 return false;
758 }
759
760 osl_kernels.push_back({std::move(osl_ptx), std::move(init_name), std::move(entry_name)});
761 }
762 else {
763 /* Add empty entry for non-existent shader groups, so that the index stays stable. */
764 osl_kernels.emplace_back();
765 }
766 }
767 }
768
769 const CUDAContextScope scope(this);
770
771 if (pipelines[PIP_SHADE]) {
772 optixPipelineDestroy(pipelines[PIP_SHADE]);
773 }
774
775 for (OptixModule &module : osl_modules) {
776 if (module != NULL) {
777 optixModuleDestroy(module);
778 module = NULL;
779 }
780 }
781 for (OptixProgramGroup &group : osl_groups) {
782 if (group != NULL) {
783 optixProgramGroupDestroy(group);
784 group = NULL;
785 }
786 }
787
788 if (osl_kernels.empty()) {
789 /* No OSL shader groups, so no need to create a pipeline. */
790 return true;
791 }
792
793 OptixProgramGroupOptions group_options = {}; /* There are no options currently. */
794 OptixModuleCompileOptions module_options = {};
795 module_options.optLevel = OPTIX_COMPILE_OPTIMIZATION_LEVEL_3;
796 module_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
797
798 osl_groups.resize(osl_kernels.size() * 2 + 1);
799 osl_modules.resize(osl_kernels.size() + 1);
800
801 { /* Load and compile PTX module with OSL services. */
802 string ptx_data, ptx_filename = path_get("lib/kernel_optix_osl_services.ptx.zst");
803 if (!path_read_compressed_text(ptx_filename, ptx_data)) {
804 set_error(string_printf("Failed to load OptiX OSL services kernel from '%s'",
805 ptx_filename.c_str()));
806 return false;
807 }
808
809# if OPTIX_ABI_VERSION >= 84
810 const OptixResult result = optixModuleCreate(context,
811 &module_options,
812 &pipeline_options,
813 ptx_data.data(),
814 ptx_data.size(),
815 nullptr,
816 0,
817 &osl_modules.back());
818# else
819 const OptixResult result = optixModuleCreateFromPTX(context,
820 &module_options,
821 &pipeline_options,
822 ptx_data.data(),
823 ptx_data.size(),
824 nullptr,
825 0,
826 &osl_modules.back());
827# endif
828 if (result != OPTIX_SUCCESS) {
829 set_error(string_printf("Failed to load OptiX OSL services kernel from '%s' (%s)",
830 ptx_filename.c_str(),
831 optixGetErrorName(result)));
832 return false;
833 }
834
835 OptixProgramGroupDesc group_desc = {};
836 group_desc.kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
837 group_desc.callables.entryFunctionNameDC = "__direct_callable__dummy_services";
838 group_desc.callables.moduleDC = osl_modules.back();
839
840 optix_assert(optixProgramGroupCreate(
841 context, &group_desc, 1, &group_options, nullptr, 0, &osl_groups.back()));
842 }
843
845 vector<OptixResult> results(osl_kernels.size(), OPTIX_SUCCESS);
846
847 for (size_t i = 0; i < osl_kernels.size(); ++i) {
848 if (osl_kernels[i].ptx.empty()) {
849 continue;
850 }
851
852# if OPTIX_ABI_VERSION >= 84
853 OptixTask task = nullptr;
854 results[i] = optixModuleCreateWithTasks(context,
855 &module_options,
856 &pipeline_options,
857 osl_kernels[i].ptx.data(),
858 osl_kernels[i].ptx.size(),
859 nullptr,
860 nullptr,
861 &osl_modules[i],
862 &task);
863 if (results[i] == OPTIX_SUCCESS) {
864 execute_optix_task(pool, task, results[i]);
865 }
866# elif OPTIX_ABI_VERSION >= 55
867 OptixTask task = nullptr;
868 results[i] = optixModuleCreateFromPTXWithTasks(context,
869 &module_options,
870 &pipeline_options,
871 osl_kernels[i].ptx.data(),
872 osl_kernels[i].ptx.size(),
873 nullptr,
874 nullptr,
875 &osl_modules[i],
876 &task);
877 if (results[i] == OPTIX_SUCCESS) {
878 execute_optix_task(pool, task, results[i]);
879 }
880# else
881 pool.push([this, &results, i, &module_options, &osl_kernels]() {
882 results[i] = optixModuleCreateFromPTX(context,
883 &module_options,
884 &pipeline_options,
885 osl_kernels[i].ptx.data(),
886 osl_kernels[i].ptx.size(),
887 nullptr,
888 0,
889 &osl_modules[i]);
890 });
891# endif
892 }
893
894 pool.wait_work();
895
896 for (size_t i = 0; i < osl_kernels.size(); ++i) {
897 if (osl_kernels[i].ptx.empty()) {
898 continue;
899 }
900
901 if (results[i] != OPTIX_SUCCESS) {
902 set_error(string_printf("Failed to load OptiX OSL kernel for %s (%s)",
903 osl_kernels[i].init_entry.c_str(),
904 optixGetErrorName(results[i])));
905 return false;
906 }
907
908 OptixProgramGroupDesc group_descs[2] = {};
909 group_descs[0].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
910 group_descs[0].callables.entryFunctionNameDC = osl_kernels[i].init_entry.c_str();
911 group_descs[0].callables.moduleDC = osl_modules[i];
912 group_descs[1].kind = OPTIX_PROGRAM_GROUP_KIND_CALLABLES;
913 group_descs[1].callables.entryFunctionNameDC = osl_kernels[i].exec_entry.c_str();
914 group_descs[1].callables.moduleDC = osl_modules[i];
915
916 optix_assert(optixProgramGroupCreate(
917 context, group_descs, 2, &group_options, nullptr, 0, &osl_groups[i * 2]));
918 }
919
920 /* Update SBT with new entries. */
921 sbt_data.alloc(NUM_PROGRAM_GROUPS + osl_groups.size());
922 for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
923 optix_assert(optixSbtRecordPackHeader(groups[i], &sbt_data[i]));
924 }
925 for (size_t i = 0; i < osl_groups.size(); ++i) {
926 if (osl_groups[i] != NULL) {
927 optix_assert(optixSbtRecordPackHeader(osl_groups[i], &sbt_data[NUM_PROGRAM_GROUPS + i]));
928 }
929 else {
930 /* Default to "__direct_callable__dummy_services", so that OSL evaluation for empty
931 * materials has direct callables to call and does not crash. */
932 optix_assert(optixSbtRecordPackHeader(osl_groups.back(), &sbt_data[NUM_PROGRAM_GROUPS + i]));
933 }
934 }
935 sbt_data.copy_to_device(); /* Upload updated SBT to device. */
936
937 OptixPipelineLinkOptions link_options = {};
938 link_options.maxTraceDepth = 0;
939# if OPTIX_ABI_VERSION < 84
940 link_options.debugLevel = OPTIX_COMPILE_DEBUG_LEVEL_NONE;
941# endif
942
943 {
944 vector<OptixProgramGroup> pipeline_groups;
945 pipeline_groups.reserve(NUM_PROGRAM_GROUPS);
946 pipeline_groups.push_back(groups[PG_RGEN_SHADE_BACKGROUND]);
947 pipeline_groups.push_back(groups[PG_RGEN_SHADE_LIGHT]);
948 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE]);
949 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_RAYTRACE]);
950 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SURFACE_MNEE]);
951 pipeline_groups.push_back(groups[PG_RGEN_SHADE_VOLUME]);
952 pipeline_groups.push_back(groups[PG_RGEN_SHADE_SHADOW]);
953 pipeline_groups.push_back(groups[PG_RGEN_SHADE_DEDICATED_LIGHT]);
954 pipeline_groups.push_back(groups[PG_RGEN_EVAL_DISPLACE]);
955 pipeline_groups.push_back(groups[PG_RGEN_EVAL_BACKGROUND]);
956 pipeline_groups.push_back(groups[PG_RGEN_EVAL_CURVE_SHADOW_TRANSPARENCY]);
957
958 for (const OptixProgramGroup &group : osl_groups) {
959 if (group != NULL) {
960 pipeline_groups.push_back(group);
961 }
962 }
963
964 optix_assert(optixPipelineCreate(context,
965 &pipeline_options,
966 &link_options,
967 pipeline_groups.data(),
968 pipeline_groups.size(),
969 nullptr,
970 0,
971 &pipelines[PIP_SHADE]));
972
973 /* Get program stack sizes. */
974 OptixStackSizes stack_size[NUM_PROGRAM_GROUPS] = {};
975 vector<OptixStackSizes> osl_stack_size(osl_groups.size());
976
977 for (int i = 0; i < NUM_PROGRAM_GROUPS; ++i) {
978# if OPTIX_ABI_VERSION >= 84
979 optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i], nullptr));
980# else
981 optix_assert(optixProgramGroupGetStackSize(groups[i], &stack_size[i]));
982# endif
983 }
984 for (size_t i = 0; i < osl_groups.size(); ++i) {
985 if (osl_groups[i] != NULL) {
986# if OPTIX_ABI_VERSION >= 84
987 optix_assert(optixProgramGroupGetStackSize(
988 osl_groups[i], &osl_stack_size[i], pipelines[PIP_SHADE]));
989# else
990 optix_assert(optixProgramGroupGetStackSize(osl_groups[i], &osl_stack_size[i]));
991# endif
992 }
993 }
994
995 const unsigned int css = std::max(stack_size[PG_RGEN_SHADE_SURFACE_RAYTRACE].cssRG,
996 stack_size[PG_RGEN_SHADE_SURFACE_MNEE].cssRG);
997 unsigned int dss = 0;
998 for (unsigned int i = 0; i < osl_stack_size.size(); ++i) {
999 dss = std::max(dss, osl_stack_size[i].dssDC);
1000 }
1001
1002 optix_assert(optixPipelineSetStackSize(
1003 pipelines[PIP_SHADE], 0, dss, css, pipeline_options.usesMotionBlur ? 3 : 2));
1004 }
1005
1006 return !have_error();
1007# else
1008 return false;
1009# endif
1010}
1011
/* Return host-side memory used for OSL shading on this device.
 * When built with OSL support this is the address of the device's `osl_globals` member;
 * otherwise there is nothing to expose and NULL is returned. */
void *OptiXDevice::get_cpu_osl_memory()
{
# ifdef WITH_OSL
  return &osl_globals;
# else
  return NULL;
# endif
}
1020
1021bool OptiXDevice::build_optix_bvh(BVHOptiX *bvh,
1022 OptixBuildOperation operation,
1023 const OptixBuildInput &build_input,
1024 uint16_t num_motion_steps)
1025{
1026 /* Allocate and build acceleration structures only one at a time, to prevent parallel builds
1027 * from running out of memory (since both original and compacted acceleration structure memory
1028 * may be allocated at the same time for the duration of this function). The builds would
1029 * otherwise happen on the same CUDA stream anyway. */
1030 static thread_mutex mutex;
1032
1033 const CUDAContextScope scope(this);
1034
1035 bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
1036
1037 /* Compute memory usage. */
1038 OptixAccelBufferSizes sizes = {};
1039 OptixAccelBuildOptions options = {};
1040 options.operation = operation;
1041 if (build_input.type == OPTIX_BUILD_INPUT_TYPE_CURVES) {
1042 /* The build flags have to match the ones used to query the built-in curve intersection
1043 * program (see optixBuiltinISModuleGet above) */
1044 options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION |
1045 OPTIX_BUILD_FLAG_ALLOW_UPDATE;
1046 use_fast_trace_bvh = true;
1047 }
1048 else if (use_fast_trace_bvh) {
1049 VLOG_INFO << "Using fast to trace OptiX BVH";
1050 options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_TRACE | OPTIX_BUILD_FLAG_ALLOW_COMPACTION;
1051 }
1052 else {
1053 VLOG_INFO << "Using fast to update OptiX BVH";
1054 options.buildFlags = OPTIX_BUILD_FLAG_PREFER_FAST_BUILD | OPTIX_BUILD_FLAG_ALLOW_UPDATE;
1055 }
1056
1057 options.motionOptions.numKeys = num_motion_steps;
1058 options.motionOptions.flags = OPTIX_MOTION_FLAG_START_VANISH | OPTIX_MOTION_FLAG_END_VANISH;
1059 options.motionOptions.timeBegin = 0.0f;
1060 options.motionOptions.timeEnd = 1.0f;
1061
1062 optix_assert(optixAccelComputeMemoryUsage(context, &options, &build_input, 1, &sizes));
1063
1064 /* Allocate required output buffers. */
1065 device_only_memory<char> temp_mem(this, "optix temp as build mem", true);
1066 temp_mem.alloc_to_device(align_up(sizes.tempSizeInBytes, 8) + 8);
1067 if (!temp_mem.device_pointer) {
1068 /* Make sure temporary memory allocation succeeded. */
1069 return false;
1070 }
1071
1072 /* Acceleration structure memory has to be allocated on the device (not allowed on the host). */
1073 device_only_memory<char> &out_data = *bvh->as_data;
1074 if (operation == OPTIX_BUILD_OPERATION_BUILD) {
1075 assert(out_data.device == this);
1076 out_data.alloc_to_device(sizes.outputSizeInBytes);
1077 if (!out_data.device_pointer) {
1078 return false;
1079 }
1080 }
1081 else {
1082 assert(out_data.device_pointer && out_data.device_size >= sizes.outputSizeInBytes);
1083 }
1084
1085 /* Finally build the acceleration structure. */
1086 OptixAccelEmitDesc compacted_size_prop = {};
1087 compacted_size_prop.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE;
1088 /* A tiny space was allocated for this property at the end of the temporary buffer above.
1089 * Make sure this pointer is 8-byte aligned. */
1090 compacted_size_prop.result = align_up(temp_mem.device_pointer + sizes.tempSizeInBytes, 8);
1091
1092 OptixTraversableHandle out_handle = 0;
1093 optix_assert(optixAccelBuild(context,
1094 NULL,
1095 &options,
1096 &build_input,
1097 1,
1098 temp_mem.device_pointer,
1099 sizes.tempSizeInBytes,
1100 out_data.device_pointer,
1101 sizes.outputSizeInBytes,
1102 &out_handle,
1103 use_fast_trace_bvh ? &compacted_size_prop : NULL,
1104 use_fast_trace_bvh ? 1 : 0));
1105 bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1106
1107 /* Wait for all operations to finish. */
1108 cuda_assert(cuStreamSynchronize(NULL));
1109
1110 /* Compact acceleration structure to save memory (do not do this in viewport for faster builds).
1111 */
1112 if (use_fast_trace_bvh) {
1113 uint64_t compacted_size = sizes.outputSizeInBytes;
1114 cuda_assert(cuMemcpyDtoH(&compacted_size, compacted_size_prop.result, sizeof(compacted_size)));
1115
1116 /* Temporary memory is no longer needed, so free it now to make space. */
1117 temp_mem.free();
1118
1119 /* There is no point compacting if the size does not change. */
1120 if (compacted_size < sizes.outputSizeInBytes) {
1121 device_only_memory<char> compacted_data(this, "optix compacted as", false);
1122 compacted_data.alloc_to_device(compacted_size);
1123 if (!compacted_data.device_pointer) {
1124 /* Do not compact if memory allocation for compacted acceleration structure fails.
1125 * Can just use the uncompacted one then, so succeed here regardless. */
1126 return !have_error();
1127 }
1128
1129 optix_assert(optixAccelCompact(
1130 context, NULL, out_handle, compacted_data.device_pointer, compacted_size, &out_handle));
1131 bvh->traversable_handle = static_cast<uint64_t>(out_handle);
1132
1133 /* Wait for compaction to finish. */
1134 cuda_assert(cuStreamSynchronize(NULL));
1135
1136 std::swap(out_data.device_size, compacted_data.device_size);
1137 std::swap(out_data.device_pointer, compacted_data.device_pointer);
1138 /* Original acceleration structure memory is freed when 'compacted_data' goes out of scope.
1139 */
1140 }
1141 }
1142
1143 return !have_error();
1144}
1145
1146void OptiXDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
1147{
1148 const bool use_fast_trace_bvh = (bvh->params.bvh_type == BVH_TYPE_STATIC);
1149
1150 free_bvh_memory_delayed();
1151
1152 BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1153
1154 progress.set_substatus("Building OptiX acceleration structure");
1155
1156 if (!bvh->params.top_level) {
1157 assert(bvh->objects.size() == 1 && bvh->geometry.size() == 1);
1158
1159 /* Refit is only possible in viewport for now (because AS is built with
1160 * OPTIX_BUILD_FLAG_ALLOW_UPDATE only there, see above). */
1161 OptixBuildOperation operation = OPTIX_BUILD_OPERATION_BUILD;
1162 if (refit && !use_fast_trace_bvh) {
1163 assert(bvh_optix->traversable_handle != 0);
1164 operation = OPTIX_BUILD_OPERATION_UPDATE;
1165 }
1166 else {
1167 bvh_optix->as_data->free();
1168 bvh_optix->traversable_handle = 0;
1169 }
1170
1171 /* Build bottom level acceleration structures (BLAS). */
1172 Geometry *const geom = bvh->geometry[0];
1173 if (geom->geometry_type == Geometry::HAIR) {
1174 /* Build BLAS for curve primitives. */
1175 Hair *const hair = static_cast<Hair *const>(geom);
1176 if (hair->num_segments() == 0) {
1177 return;
1178 }
1179
1180 const size_t num_segments = hair->num_segments();
1181
1182 size_t num_motion_steps = 1;
1184 if (pipeline_options.usesMotionBlur && hair->get_use_motion_blur() && motion_keys) {
1185 num_motion_steps = hair->get_motion_steps();
1186 }
1187
1188 device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
1189 device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1190 device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1191 /* Four control points for each curve segment. */
1192 size_t num_vertices = num_segments * 4;
1193 if (hair->curve_shape == CURVE_THICK) {
1194# if OPTIX_ABI_VERSION >= 55
1195 num_vertices = hair->num_keys() + 2 * hair->num_curves();
1196# endif
1197 index_data.alloc(num_segments);
1198 vertex_data.alloc(num_vertices * num_motion_steps);
1199 }
1200 else {
1201 aabb_data.alloc(num_segments * num_motion_steps);
1202 }
1203
1204 /* Get AABBs for each motion step. */
1205 for (size_t step = 0; step < num_motion_steps; ++step) {
1206 /* The center step for motion vertices is not stored in the attribute. */
1207 const float3 *keys = hair->get_curve_keys().data();
1208 size_t center_step = (num_motion_steps - 1) / 2;
1209 if (step != center_step) {
1210 size_t attr_offset = (step > center_step) ? step - 1 : step;
1211 /* Technically this is a float4 array, but sizeof(float3) == sizeof(float4). */
1212 keys = motion_keys->data_float3() + attr_offset * hair->get_curve_keys().size();
1213 }
1214
1215# if OPTIX_ABI_VERSION >= 55
1216 if (hair->curve_shape == CURVE_THICK) {
1217 for (size_t curve_index = 0, segment_index = 0, vertex_index = step * num_vertices;
1218 curve_index < hair->num_curves();
1219 ++curve_index)
1220 {
1221 const Hair::Curve curve = hair->get_curve(curve_index);
1222 const array<float> &curve_radius = hair->get_curve_radius();
1223
1224 const int first_key_index = curve.first_key;
1225 {
1226 vertex_data[vertex_index++] = make_float4(keys[first_key_index].x,
1227 keys[first_key_index].y,
1228 keys[first_key_index].z,
1229 curve_radius[first_key_index]);
1230 }
1231
1232 for (int k = 0; k < curve.num_segments(); ++k) {
1233 if (step == 0) {
1234 index_data[segment_index++] = vertex_index - 1;
1235 }
1236 vertex_data[vertex_index++] = make_float4(keys[first_key_index + k].x,
1237 keys[first_key_index + k].y,
1238 keys[first_key_index + k].z,
1239 curve_radius[first_key_index + k]);
1240 }
1241
1242 const int last_key_index = first_key_index + curve.num_keys - 1;
1243 {
1244 vertex_data[vertex_index++] = make_float4(keys[last_key_index].x,
1245 keys[last_key_index].y,
1246 keys[last_key_index].z,
1247 curve_radius[last_key_index]);
1248 vertex_data[vertex_index++] = make_float4(keys[last_key_index].x,
1249 keys[last_key_index].y,
1250 keys[last_key_index].z,
1251 curve_radius[last_key_index]);
1252 }
1253 }
1254 }
1255 else
1256# endif
1257 {
1258 for (size_t curve_index = 0, i = 0; curve_index < hair->num_curves(); ++curve_index) {
1259 const Hair::Curve curve = hair->get_curve(curve_index);
1260
1261 for (int segment = 0; segment < curve.num_segments(); ++segment, ++i) {
1262# if OPTIX_ABI_VERSION < 55
1263 if (hair->curve_shape == CURVE_THICK) {
1264 const array<float> &curve_radius = hair->get_curve_radius();
1265
1266 int k0 = curve.first_key + segment;
1267 int k1 = k0 + 1;
1268 int ka = max(k0 - 1, curve.first_key);
1269 int kb = min(k1 + 1, curve.first_key + curve.num_keys - 1);
1270
1271 index_data[i] = i * 4;
1272 float4 *const v = vertex_data.data() + step * num_vertices + index_data[i];
1273
1274 const float4 px = make_float4(keys[ka].x, keys[k0].x, keys[k1].x, keys[kb].x);
1275 const float4 py = make_float4(keys[ka].y, keys[k0].y, keys[k1].y, keys[kb].y);
1276 const float4 pz = make_float4(keys[ka].z, keys[k0].z, keys[k1].z, keys[kb].z);
1277 const float4 pw = make_float4(
1278 curve_radius[ka], curve_radius[k0], curve_radius[k1], curve_radius[kb]);
1279
1280 /* Convert Catmull-Rom data to B-spline. */
1281 static const float4 cr2bsp0 = make_float4(+7, -4, +5, -2) / 6.f;
1282 static const float4 cr2bsp1 = make_float4(-2, 11, -4, +1) / 6.f;
1283 static const float4 cr2bsp2 = make_float4(+1, -4, 11, -2) / 6.f;
1284 static const float4 cr2bsp3 = make_float4(-2, +5, -4, +7) / 6.f;
1285
1286 v[0] = make_float4(
1287 dot(cr2bsp0, px), dot(cr2bsp0, py), dot(cr2bsp0, pz), dot(cr2bsp0, pw));
1288 v[1] = make_float4(
1289 dot(cr2bsp1, px), dot(cr2bsp1, py), dot(cr2bsp1, pz), dot(cr2bsp1, pw));
1290 v[2] = make_float4(
1291 dot(cr2bsp2, px), dot(cr2bsp2, py), dot(cr2bsp2, pz), dot(cr2bsp2, pw));
1292 v[3] = make_float4(
1293 dot(cr2bsp3, px), dot(cr2bsp3, py), dot(cr2bsp3, pz), dot(cr2bsp3, pw));
1294 }
1295 else
1296# endif
1297 {
1299 curve.bounds_grow(segment, keys, hair->get_curve_radius().data(), bounds);
1300
1301 const size_t index = step * num_segments + i;
1302 aabb_data[index].minX = bounds.min.x;
1303 aabb_data[index].minY = bounds.min.y;
1304 aabb_data[index].minZ = bounds.min.z;
1305 aabb_data[index].maxX = bounds.max.x;
1306 aabb_data[index].maxY = bounds.max.y;
1307 aabb_data[index].maxZ = bounds.max.z;
1308 }
1309 }
1310 }
1311 }
1312 }
1313
1314 /* Upload AABB data to GPU. */
1315 aabb_data.copy_to_device();
1316 index_data.copy_to_device();
1317 vertex_data.copy_to_device();
1318
1319 vector<device_ptr> aabb_ptrs;
1320 aabb_ptrs.reserve(num_motion_steps);
1321 vector<device_ptr> width_ptrs;
1322 vector<device_ptr> vertex_ptrs;
1323 width_ptrs.reserve(num_motion_steps);
1324 vertex_ptrs.reserve(num_motion_steps);
1325 for (size_t step = 0; step < num_motion_steps; ++step) {
1326 aabb_ptrs.push_back(aabb_data.device_pointer + step * num_segments * sizeof(OptixAabb));
1327 const device_ptr base_ptr = vertex_data.device_pointer +
1328 step * num_vertices * sizeof(float4);
1329 width_ptrs.push_back(base_ptr + 3 * sizeof(float)); /* Offset by vertex size. */
1330 vertex_ptrs.push_back(base_ptr);
1331 }
1332
1333 /* Force a single any-hit call, so shadow record-all behavior works correctly. */
1334 unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1335 OptixBuildInput build_input = {};
1336 if (hair->curve_shape == CURVE_THICK) {
1337 build_input.type = OPTIX_BUILD_INPUT_TYPE_CURVES;
1338# if OPTIX_ABI_VERSION >= 55
1339 build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CATMULLROM;
1340# else
1341 build_input.curveArray.curveType = OPTIX_PRIMITIVE_TYPE_ROUND_CUBIC_BSPLINE;
1342# endif
1343 build_input.curveArray.numPrimitives = num_segments;
1344 build_input.curveArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1345 build_input.curveArray.numVertices = num_vertices;
1346 build_input.curveArray.vertexStrideInBytes = sizeof(float4);
1347 build_input.curveArray.widthBuffers = (CUdeviceptr *)width_ptrs.data();
1348 build_input.curveArray.widthStrideInBytes = sizeof(float4);
1349 build_input.curveArray.indexBuffer = (CUdeviceptr)index_data.device_pointer;
1350 build_input.curveArray.indexStrideInBytes = sizeof(int);
1351 build_input.curveArray.flag = build_flags;
1352 build_input.curveArray.primitiveIndexOffset = hair->curve_segment_offset;
1353 }
1354 else {
1355 /* Disable visibility test any-hit program, since it is already checked during
1356 * intersection. Those trace calls that require any-hit can force it with a ray flag. */
1357 build_flags |= OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT;
1358
1359 build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1360 build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1361 build_input.customPrimitiveArray.numPrimitives = num_segments;
1362 build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
1363 build_input.customPrimitiveArray.flags = &build_flags;
1364 build_input.customPrimitiveArray.numSbtRecords = 1;
1365 build_input.customPrimitiveArray.primitiveIndexOffset = hair->curve_segment_offset;
1366 }
1367
1368 if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1369 progress.set_error("Failed to build OptiX acceleration structure");
1370 }
1371 }
1372 else if (geom->geometry_type == Geometry::MESH || geom->geometry_type == Geometry::VOLUME) {
1373 /* Build BLAS for triangle primitives. */
1374 Mesh *const mesh = static_cast<Mesh *const>(geom);
1375 if (mesh->num_triangles() == 0) {
1376 return;
1377 }
1378
1379 const size_t num_verts = mesh->get_verts().size();
1380
1381 size_t num_motion_steps = 1;
1383 if (pipeline_options.usesMotionBlur && mesh->get_use_motion_blur() && motion_keys) {
1384 num_motion_steps = mesh->get_motion_steps();
1385 }
1386
1387 device_vector<int> index_data(this, "optix temp index data", MEM_READ_ONLY);
1388 index_data.alloc(mesh->get_triangles().size());
1389 memcpy(index_data.data(),
1390 mesh->get_triangles().data(),
1391 mesh->get_triangles().size() * sizeof(int));
1392 device_vector<float4> vertex_data(this, "optix temp vertex data", MEM_READ_ONLY);
1393 vertex_data.alloc(num_verts * num_motion_steps);
1394
1395 for (size_t step = 0; step < num_motion_steps; ++step) {
1396 const float3 *verts = mesh->get_verts().data();
1397
1398 size_t center_step = (num_motion_steps - 1) / 2;
1399 /* The center step for motion vertices is not stored in the attribute. */
1400 if (step != center_step) {
1401 verts = motion_keys->data_float3() + (step > center_step ? step - 1 : step) * num_verts;
1402 }
1403
1404 memcpy(vertex_data.data() + num_verts * step, verts, num_verts * sizeof(float3));
1405 }
1406
1407 /* Upload triangle data to GPU. */
1408 index_data.copy_to_device();
1409 vertex_data.copy_to_device();
1410
1411 vector<device_ptr> vertex_ptrs;
1412 vertex_ptrs.reserve(num_motion_steps);
1413 for (size_t step = 0; step < num_motion_steps; ++step) {
1414 vertex_ptrs.push_back(vertex_data.device_pointer + num_verts * step * sizeof(float3));
1415 }
1416
1417 /* Force a single any-hit call, so shadow record-all behavior works correctly. */
1418 unsigned int build_flags = OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1419 OptixBuildInput build_input = {};
1420 build_input.type = OPTIX_BUILD_INPUT_TYPE_TRIANGLES;
1421 build_input.triangleArray.vertexBuffers = (CUdeviceptr *)vertex_ptrs.data();
1422 build_input.triangleArray.numVertices = num_verts;
1423 build_input.triangleArray.vertexFormat = OPTIX_VERTEX_FORMAT_FLOAT3;
1424 build_input.triangleArray.vertexStrideInBytes = sizeof(float4);
1425 build_input.triangleArray.indexBuffer = index_data.device_pointer;
1426 build_input.triangleArray.numIndexTriplets = mesh->num_triangles();
1427 build_input.triangleArray.indexFormat = OPTIX_INDICES_FORMAT_UNSIGNED_INT3;
1428 build_input.triangleArray.indexStrideInBytes = 3 * sizeof(int);
1429 build_input.triangleArray.flags = &build_flags;
1430 /* The SBT does not store per primitive data since Cycles already allocates separate
1431 * buffers for that purpose. OptiX does not allow this to be zero though, so just pass in
1432 * one and rely on that having the same meaning in this case. */
1433 build_input.triangleArray.numSbtRecords = 1;
1434 build_input.triangleArray.primitiveIndexOffset = mesh->prim_offset;
1435
1436 if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1437 progress.set_error("Failed to build OptiX acceleration structure");
1438 }
1439 }
1440 else if (geom->geometry_type == Geometry::POINTCLOUD) {
1441 /* Build BLAS for points primitives. */
1442 PointCloud *const pointcloud = static_cast<PointCloud *const>(geom);
1443 const size_t num_points = pointcloud->num_points();
1444 if (num_points == 0) {
1445 return;
1446 }
1447
1448 size_t num_motion_steps = 1;
1449 Attribute *motion_points = pointcloud->attributes.find(ATTR_STD_MOTION_VERTEX_POSITION);
1450 if (pipeline_options.usesMotionBlur && pointcloud->get_use_motion_blur() && motion_points) {
1451 num_motion_steps = pointcloud->get_motion_steps();
1452 }
1453
1454 device_vector<OptixAabb> aabb_data(this, "optix temp aabb data", MEM_READ_ONLY);
1455 aabb_data.alloc(num_points * num_motion_steps);
1456
1457 /* Get AABBs for each motion step. */
1458 for (size_t step = 0; step < num_motion_steps; ++step) {
1459 /* The center step for motion vertices is not stored in the attribute. */
1460 size_t center_step = (num_motion_steps - 1) / 2;
1461
1462 if (step == center_step) {
1463 const float3 *points = pointcloud->get_points().data();
1464 const float *radius = pointcloud->get_radius().data();
1465
1466 for (size_t i = 0; i < num_points; ++i) {
1467 const PointCloud::Point point = pointcloud->get_point(i);
1469 point.bounds_grow(points, radius, bounds);
1470
1471 const size_t index = step * num_points + i;
1472 aabb_data[index].minX = bounds.min.x;
1473 aabb_data[index].minY = bounds.min.y;
1474 aabb_data[index].minZ = bounds.min.z;
1475 aabb_data[index].maxX = bounds.max.x;
1476 aabb_data[index].maxY = bounds.max.y;
1477 aabb_data[index].maxZ = bounds.max.z;
1478 }
1479 }
1480 else {
1481 size_t attr_offset = (step > center_step) ? step - 1 : step;
1482 const float4 *points = motion_points->data_float4() + attr_offset * num_points;
1483
1484 for (size_t i = 0; i < num_points; ++i) {
1485 const PointCloud::Point point = pointcloud->get_point(i);
1487 point.bounds_grow(points[i], bounds);
1488
1489 const size_t index = step * num_points + i;
1490 aabb_data[index].minX = bounds.min.x;
1491 aabb_data[index].minY = bounds.min.y;
1492 aabb_data[index].minZ = bounds.min.z;
1493 aabb_data[index].maxX = bounds.max.x;
1494 aabb_data[index].maxY = bounds.max.y;
1495 aabb_data[index].maxZ = bounds.max.z;
1496 }
1497 }
1498 }
1499
1500 /* Upload AABB data to GPU. */
1501 aabb_data.copy_to_device();
1502
1503 vector<device_ptr> aabb_ptrs;
1504 aabb_ptrs.reserve(num_motion_steps);
1505 for (size_t step = 0; step < num_motion_steps; ++step) {
1506 aabb_ptrs.push_back(aabb_data.device_pointer + step * num_points * sizeof(OptixAabb));
1507 }
1508
1509 /* Disable visibility test any-hit program, since it is already checked during
1510 * intersection. Those trace calls that require anyhit can force it with a ray flag.
1511 * For those, force a single any-hit call, so shadow record-all behavior works correctly. */
1512 unsigned int build_flags = OPTIX_GEOMETRY_FLAG_DISABLE_ANYHIT |
1513 OPTIX_GEOMETRY_FLAG_REQUIRE_SINGLE_ANYHIT_CALL;
1514 OptixBuildInput build_input = {};
1515 build_input.type = OPTIX_BUILD_INPUT_TYPE_CUSTOM_PRIMITIVES;
1516# if OPTIX_ABI_VERSION < 23
1517 build_input.aabbArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1518 build_input.aabbArray.numPrimitives = num_points;
1519 build_input.aabbArray.strideInBytes = sizeof(OptixAabb);
1520 build_input.aabbArray.flags = &build_flags;
1521 build_input.aabbArray.numSbtRecords = 1;
1522 build_input.aabbArray.primitiveIndexOffset = pointcloud->prim_offset;
1523# else
1524 build_input.customPrimitiveArray.aabbBuffers = (CUdeviceptr *)aabb_ptrs.data();
1525 build_input.customPrimitiveArray.numPrimitives = num_points;
1526 build_input.customPrimitiveArray.strideInBytes = sizeof(OptixAabb);
1527 build_input.customPrimitiveArray.flags = &build_flags;
1528 build_input.customPrimitiveArray.numSbtRecords = 1;
1529 build_input.customPrimitiveArray.primitiveIndexOffset = pointcloud->prim_offset;
1530# endif
1531
1532 if (!build_optix_bvh(bvh_optix, operation, build_input, num_motion_steps)) {
1533 progress.set_error("Failed to build OptiX acceleration structure");
1534 }
1535 }
1536 }
1537 else {
1538 unsigned int num_instances = 0;
1539 unsigned int max_num_instances = 0xFFFFFFFF;
1540
1541 bvh_optix->as_data->free();
1542 bvh_optix->traversable_handle = 0;
1543 bvh_optix->motion_transform_data->free();
1544
1545 optixDeviceContextGetProperty(context,
1546 OPTIX_DEVICE_PROPERTY_LIMIT_MAX_INSTANCE_ID,
1547 &max_num_instances,
1548 sizeof(max_num_instances));
1549 /* Do not count first bit, which is used to distinguish instanced and non-instanced objects. */
1550 max_num_instances >>= 1;
1551 if (bvh->objects.size() > max_num_instances) {
1552 progress.set_error(
1553 "Failed to build OptiX acceleration structure because there are too many instances");
1554 return;
1555 }
1556
1557 /* Fill instance descriptions. */
1558 device_vector<OptixInstance> instances(this, "optix tlas instances", MEM_READ_ONLY);
1559 instances.alloc(bvh->objects.size());
1560
1561 /* Calculate total motion transform size and allocate memory for them. */
1562 size_t motion_transform_offset = 0;
1563 if (pipeline_options.usesMotionBlur) {
1564 size_t total_motion_transform_size = 0;
1565 for (Object *const ob : bvh->objects) {
1566 if (ob->is_traceable() && ob->use_motion()) {
1567 total_motion_transform_size = align_up(total_motion_transform_size,
1568 OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1569 const size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
1570 total_motion_transform_size = total_motion_transform_size +
1571 sizeof(OptixSRTMotionTransform) +
1572 motion_keys * sizeof(OptixSRTData);
1573 }
1574 }
1575
1576 assert(bvh_optix->motion_transform_data->device == this);
1577 bvh_optix->motion_transform_data->alloc_to_device(total_motion_transform_size);
1578 }
1579
1580 for (Object *ob : bvh->objects) {
1581 /* Skip non-traceable objects. */
1582 if (!ob->is_traceable()) {
1583 continue;
1584 }
1585
1586 BVHOptiX *const blas = static_cast<BVHOptiX *>(ob->get_geometry()->bvh);
1587 OptixTraversableHandle handle = blas->traversable_handle;
1588 if (handle == 0) {
1589 continue;
1590 }
1591
1592 OptixInstance &instance = instances[num_instances++];
1593 memset(&instance, 0, sizeof(instance));
1594
1595 /* Clear transform to identity matrix. */
1596 instance.transform[0] = 1.0f;
1597 instance.transform[5] = 1.0f;
1598 instance.transform[10] = 1.0f;
1599
1600 /* Set user instance ID to object index. */
1601 instance.instanceId = ob->get_device_index();
1602
1603 /* Add some of the object visibility bits to the mask.
1604 * __prim_visibility contains the combined visibility bits of all instances, so is not
1605 * reliable if they differ between instances. But the OptiX visibility mask can only contain
1606 * 8 bits, so have to trade-off here and select just a few important ones.
1607 */
1608 instance.visibilityMask = ob->visibility_for_tracing() & 0xFF;
1609
1610 /* Have to have at least one bit in the mask, or else instance would always be culled. */
1611 if (0 == instance.visibilityMask) {
1612 instance.visibilityMask = 0xFF;
1613 }
1614
1615 if (ob->get_geometry()->geometry_type == Geometry::HAIR &&
1616 static_cast<const Hair *>(ob->get_geometry())->curve_shape == CURVE_THICK)
1617 {
1618 if (pipeline_options.usesMotionBlur && ob->get_geometry()->has_motion_blur()) {
1619 /* Select between motion blur and non-motion blur built-in intersection module. */
1620 instance.sbtOffset = PG_HITD_MOTION - PG_HITD;
1621 }
1622 }
1623 else if (ob->get_geometry()->geometry_type == Geometry::POINTCLOUD) {
1624 /* Use the hit group that has an intersection program for point clouds. */
1625 instance.sbtOffset = PG_HITD_POINTCLOUD - PG_HITD;
1626
1627 /* Also skip point clouds in local trace calls. */
1628 instance.visibilityMask |= 4;
1629 }
1630
1631# if OPTIX_ABI_VERSION < 55
1632 /* Cannot disable any-hit program for thick curves, since it needs to filter out end-caps. */
1633 else
1634# endif
1635 {
1636 /* Can disable __anyhit__kernel_optix_visibility_test by default (except for thick curves,
1637 * since it needs to filter out end-caps there).
1638 *
1639 * It is enabled where necessary (visibility mask exceeds 8 bits or the other any-hit
1640 * programs like __anyhit__kernel_optix_shadow_all_hit) via OPTIX_RAY_FLAG_ENFORCE_ANYHIT.
1641 */
1642 instance.flags = OPTIX_INSTANCE_FLAG_DISABLE_ANYHIT;
1643 }
1644
1645 /* Insert motion traversable if object has motion. */
1646 if (pipeline_options.usesMotionBlur && ob->use_motion()) {
1647 size_t motion_keys = max(ob->get_motion().size(), (size_t)2) - 2;
1648 size_t motion_transform_size = sizeof(OptixSRTMotionTransform) +
1649 motion_keys * sizeof(OptixSRTData);
1650
1651 const CUDAContextScope scope(this);
1652
1653 motion_transform_offset = align_up(motion_transform_offset,
1654 OPTIX_TRANSFORM_BYTE_ALIGNMENT);
1655 CUdeviceptr motion_transform_gpu = bvh_optix->motion_transform_data->device_pointer +
1656 motion_transform_offset;
1657 motion_transform_offset += motion_transform_size;
1658
1659 /* Allocate host side memory for motion transform and fill it with transform data. */
1660 OptixSRTMotionTransform &motion_transform = *reinterpret_cast<OptixSRTMotionTransform *>(
1661 new uint8_t[motion_transform_size]);
1662 motion_transform.child = handle;
1663 motion_transform.motionOptions.numKeys = ob->get_motion().size();
1664 motion_transform.motionOptions.flags = OPTIX_MOTION_FLAG_NONE;
1665 motion_transform.motionOptions.timeBegin = 0.0f;
1666 motion_transform.motionOptions.timeEnd = 1.0f;
1667
1668 OptixSRTData *const srt_data = motion_transform.srtData;
1669 array<DecomposedTransform> decomp(ob->get_motion().size());
1671 decomp.data(), ob->get_motion().data(), ob->get_motion().size());
1672
1673 for (size_t i = 0; i < ob->get_motion().size(); ++i) {
1674 /* Scale. */
1675 srt_data[i].sx = decomp[i].y.w; /* scale.x.x */
1676 srt_data[i].sy = decomp[i].z.w; /* scale.y.y */
1677 srt_data[i].sz = decomp[i].w.w; /* scale.z.z */
1678
1679 /* Shear. */
1680 srt_data[i].a = decomp[i].z.x; /* scale.x.y */
1681 srt_data[i].b = decomp[i].z.y; /* scale.x.z */
1682 srt_data[i].c = decomp[i].w.x; /* scale.y.z */
1683 assert(decomp[i].z.z == 0.0f); /* scale.y.x */
1684 assert(decomp[i].w.y == 0.0f); /* scale.z.x */
1685 assert(decomp[i].w.z == 0.0f); /* scale.z.y */
1686
1687 /* Pivot point. */
1688 srt_data[i].pvx = 0.0f;
1689 srt_data[i].pvy = 0.0f;
1690 srt_data[i].pvz = 0.0f;
1691
1692 /* Rotation. */
1693 srt_data[i].qx = decomp[i].x.x;
1694 srt_data[i].qy = decomp[i].x.y;
1695 srt_data[i].qz = decomp[i].x.z;
1696 srt_data[i].qw = decomp[i].x.w;
1697
1698 /* Translation. */
1699 srt_data[i].tx = decomp[i].y.x;
1700 srt_data[i].ty = decomp[i].y.y;
1701 srt_data[i].tz = decomp[i].y.z;
1702 }
1703
1704 /* Upload motion transform to GPU. */
1705 cuMemcpyHtoD(motion_transform_gpu, &motion_transform, motion_transform_size);
1706 delete[] reinterpret_cast<uint8_t *>(&motion_transform);
1707
1708 /* Get traversable handle to motion transform. */
1709 optixConvertPointerToTraversableHandle(context,
1710 motion_transform_gpu,
1711 OPTIX_TRAVERSABLE_TYPE_SRT_MOTION_TRANSFORM,
1712 &instance.traversableHandle);
1713 }
1714 else {
1715 instance.traversableHandle = handle;
1716
1717 if (ob->get_geometry()->is_instanced()) {
1718 /* Set transform matrix. */
1719 memcpy(instance.transform, &ob->get_tfm(), sizeof(instance.transform));
1720 }
1721 }
1722 }
1723
1724 /* Upload instance descriptions. */
1725 instances.resize(num_instances);
1726 instances.copy_to_device();
1727
1728 /* Build top-level acceleration structure (TLAS) */
1729 OptixBuildInput build_input = {};
1730 build_input.type = OPTIX_BUILD_INPUT_TYPE_INSTANCES;
1731 build_input.instanceArray.instances = instances.device_pointer;
1732 build_input.instanceArray.numInstances = num_instances;
1733
1734 if (!build_optix_bvh(bvh_optix, OPTIX_BUILD_OPERATION_BUILD, build_input, 0)) {
1735 progress.set_error("Failed to build OptiX acceleration structure");
1736 }
1737 tlas_handle = bvh_optix->traversable_handle;
1738 }
1739}
1740
1741void OptiXDevice::release_bvh(BVH *bvh)
1742{
1743 thread_scoped_lock lock(delayed_free_bvh_mutex);
1744 /* Do delayed free of BVH memory, since geometry holding BVH might be deleted
1745 * while GPU is still rendering. */
1746 BVHOptiX *const bvh_optix = static_cast<BVHOptiX *>(bvh);
1747
1748 delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->as_data));
1749 delayed_free_bvh_memory.emplace_back(std::move(bvh_optix->motion_transform_data));
1750 bvh_optix->traversable_handle = 0;
1751}
1752
1753void OptiXDevice::free_bvh_memory_delayed()
1754{
1755 thread_scoped_lock lock(delayed_free_bvh_mutex);
1756 delayed_free_bvh_memory.free_memory();
1757}
1758
void OptiXDevice::const_copy_to(const char *name, void *host, size_t size)
{
  /* Upload a named constant to the device: first update the CUDA module's
   * constant memory, then mirror the same bytes into the OptiX launch
   * parameter buffer (which is what the OptiX kernels actually read).
   *
   * name: "data" for KernelData, otherwise the name of a kernel data array.
   * host: host-side source pointer.
   * size: number of bytes to copy. */

  /* Set constant memory for CUDA module. */
  CUDADevice::const_copy_to(name, host, size);

  if (strcmp(name, "data") == 0) {
    assert(size <= sizeof(KernelData));

    /* Update traversable handle (since it is different for each device on multi devices). */
    KernelData *const data = (KernelData *)host;
    *(OptixTraversableHandle *)&data->device_bvh = tlas_handle;

    update_launch_params(offsetof(KernelParamsOptiX, data), host, size);
    return;
  }

  /* Update data storage pointers in launch parameters. Each array name is
   * matched against the member of the same name in KernelParamsOptiX; the
   * list of arrays comes from expanding KERNEL_DATA_ARRAY over
   * "kernel/data_arrays.h". */
# define KERNEL_DATA_ARRAY(data_type, data_name) \
  if (strcmp(name, #data_name) == 0) { \
    update_launch_params(offsetof(KernelParamsOptiX, data_name), host, size); \
    return; \
  }
  KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}
1785
1786void OptiXDevice::update_launch_params(size_t offset, void *data, size_t data_size)
1787{
1788 const CUDAContextScope scope(this);
1789
1790 cuda_assert(cuMemcpyHtoD(launch_params.device_pointer + offset, data, data_size));
1791}
1792
1794
1795#endif /* WITH_OPTIX */
unsigned int uint
ThreadMutex mutex
in reality light always falls off quadratically Particle Retrieve the data of the particle that spawned the object instance
in reality light always falls off quadratically Particle Retrieve the data of the particle that spawned the object for example to give variation to multiple instances of an object Point Retrieve information about points in a point cloud Retrieve the edges of an object as it appears to Cycles topology will always appear triangulated Convert a blackbody temperature to an RGB value Normal Generate a perturbed normal from an RGB normal map image Typically used for faking highly detailed surfaces Generate an OSL shader from a file or text data block Image Sample an image file as a texture Gabor Generate Gabor noise Gradient Generate interpolated color and intensity values based on the input vector Magic Generate a psychedelic color texture Voronoi Generate Worley noise based on the distance to random points Typically used to generate textures such as or biological cells Brick Generate a procedural texture producing bricks Texture Retrieve multiple types of texture coordinates nTypically used as inputs for texture nodes Vector Convert a point
volatile int lock
ATTR_WARN_UNUSED_RESULT const BMVert * v
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition btDbvt.cpp:52
static btDbvtVolume bounds(btDbvtNode **leaves, int count)
Definition btDbvt.cpp:299
void refit(btStridingMeshInterface *triangles, const btVector3 &aabbMin, const btVector3 &aabbMax)
SIMD_FORCE_INLINE const btScalar & z() const
Return the z value.
Definition btQuadWord.h:117
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
Definition btQuadWord.h:119
Attribute * find(ustring name) const
float3 * data_float3()
float4 * data_float4()
bool top_level
Definition params.h:81
int bvh_type
Definition params.h:106
Definition bvh/bvh.h:66
vector< Geometry * > geometry
Definition bvh/bvh.h:69
BVHParams params
Definition bvh/bvh.h:68
vector< Object * > objects
Definition bvh/bvh.h:70
Type geometry_type
size_t prim_offset
AttributeSet attributes
Definition hair.h:14
Curve get_curve(size_t i) const
Definition hair.h:112
size_t curve_segment_offset
Definition hair.h:91
size_t num_curves() const
Definition hair.h:126
size_t num_segments() const
Definition hair.h:131
CurveShapeType curve_shape
Definition hair.h:92
size_t num_keys() const
Definition hair.h:121
void set_substatus(const string &substatus_)
Definition progress.h:274
void set_error(const string &error_message_)
Definition progress.h:113
void alloc_to_device(size_t num, bool shrink_to_fit=true)
additional_info("compositor_sum_squared_difference_float_shared") .push_constant(Type output_img float dot(value.rgb, luminance_coefficients)") .define("LOAD(value)"
@ MEM_READ_ONLY
CCL_NAMESPACE_BEGIN struct Options options
#define KERNEL_DATA_ARRAY(type, name)
Definition data_arrays.h:6
DebugFlags & DebugFlags()
Definition debug.h:142
#define function_bind
#define CCL_NAMESPACE_END
ccl_device_forceinline float4 make_float4(const float x, const float y, const float z, const float w)
#define NULL
#define offsetof(t, d)
draw_view push_constant(Type::INT, "radiance_src") .push_constant(Type capture_info_buf storage_buf(1, Qualifier::READ, "ObjectBounds", "bounds_buf[]") .push_constant(Type draw_view int
static float verts[][3]
@ SHADER_TYPE_BUMP
@ SHADER_TYPE_SURFACE
@ SHADER_TYPE_VOLUME
@ SHADER_TYPE_DISPLACEMENT
#define KERNEL_FEATURE_OBJECT_MOTION
@ ATTR_STD_MOTION_VERTEX_POSITION
#define KERNEL_FEATURE_OSL
@ CURVE_THICK
#define KERNEL_FEATURE_SUBSURFACE
KernelData
#define KERNEL_FEATURE_HAIR_THICK
@ BVH_LAYOUT_OPTIX
#define KERNEL_FEATURE_PATH_TRACING
#define KERNEL_FEATURE_HAIR
#define KERNEL_FEATURE_NODE_RAYTRACE
#define KERNEL_FEATURE_BAKING
#define KERNEL_FEATURE_MNEE
#define KERNEL_FEATURE_POINTCLOUD
#define VLOG_INFO
Definition log.h:72
#define VLOG_IS_ON(severity)
Definition log.h:36
Segment< FEdge *, Vec3r > segment
struct blender::compositor::@345301070213251227305337367154215234324277345027::@113305264211110136200164070253045215160301331207 task
T step(const T &edge, const T &value)
void index(const bNode &, void *r_value)
int BVHLayoutMask
Definition params.h:51
@ BVH_TYPE_STATIC
Definition params.h:41
size_t path_file_size(const string &path)
Definition path.cpp:556
bool path_is_directory(const string &path)
Definition path.cpp:584
string path_get(const string &sub)
Definition path.cpp:339
string path_join(const string &dir, const string &file)
Definition path.cpp:417
bool path_read_compressed_text(const string &path, string &text)
Definition path.cpp:754
static struct PyModuleDef module
Definition python.cpp:991
#define min(a, b)
Definition sort.c:32
unsigned short uint16_t
Definition stdint.h:79
unsigned char uint8_t
Definition stdint.h:78
unsigned __int64 uint64_t
Definition stdint.h:90
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
Definition string.cpp:23
void bounds_grow(const int k, const float3 *curve_keys, const float *curve_radius, BoundBox &bounds) const
Definition hair.cpp:42
int first_key
Definition hair.h:20
int num_segments() const
Definition hair.h:23
int num_keys
Definition hair.h:21
size_t num_triangles() const
Definition scene/mesh.h:80
bool use_motion() const
int get_device_index() const
bool is_traceable() const
uint visibility_for_tracing() const
Point get_point(int i) const
size_t num_points() const
void push(TaskRunFunction &&task)
Definition task.cpp:22
void wait_work(Summary *stats=NULL)
Definition task.cpp:28
VecBase< float, 4 > float4
std::unique_lock< std::mutex > thread_scoped_lock
Definition thread.h:30
CCL_NAMESPACE_BEGIN typedef std::mutex thread_mutex
Definition thread.h:29
void transform_motion_decompose(DecomposedTransform *decomp, const Transform *motion, size_t size)
float max
ccl_device_inline size_t align_up(size_t offset, size_t alignment)
Definition util/types.h:48
uint64_t device_ptr
Definition util/types.h:45