9# include <sycl/sycl.hpp>
15# ifdef WITH_EMBREE_GPU
19# if defined(WITH_OPENIMAGEDENOISE)
20# include <OpenImageDenoise/config.h>
21# if OIDN_VERSION >= 20300
29# if defined(WITH_EMBREE_GPU) && defined(EMBREE_SYCL_SUPPORT) && !defined(SYCL_LANGUAGE_VERSION)
32extern "C" RTCDevice rtcNewSYCLDevice(sycl::context context,
const char *config);
33extern "C" bool rtcIsSYCLDeviceSupported(
const sycl::device sycl_device);
38static std::vector<sycl::device> available_sycl_devices(
bool *multiple_dgpus_detected);
39static int parse_driver_build_version(
const sycl::device &device);
// Error callback handed to the oneAPI runtime (see oneapi_set_error_cb in the
// constructor): copies the runtime's error message into the std::string passed
// as user_ptr, so the device can surface it later via set_error().
41static void queue_error_cb(
const char *message,
void *user_ptr)
44 *
reinterpret_cast<std::string *
>(user_ptr) = message;
// OneapiDevice constructor (signature line not visible in this chunk).
// Sets up the SYCL queue, optional Embree GPU device, and the kernel-globals
// memory segment. NOTE(review): several original lines are elided here.
49 :
GPUDevice(info, stats, profiler, headless)
// Handles for bindless textures are stored in texMemObject/arrayMemObject;
// these asserts pin that they are bit-compatible with the SYCL handle types.
52 static_assert(
sizeof(texMemObject) ==
53 sizeof(sycl::ext::oneapi::experimental::sampled_image_handle));
54 static_assert(
sizeof(arrayMemObject) ==
55 sizeof(sycl::ext::oneapi::experimental::image_mem_handle));
57 need_texture_info =
false;
// Route runtime errors into oneapi_error_string_ via queue_error_cb.
60 oneapi_set_error_cb(queue_error_cb, &oneapi_error_string_);
// Create the device queue; when Embree-on-GPU is enabled, also create the
// Embree SYCL device through the same call.
62 bool is_finished_ok = create_queue(device_queue_,
64# ifdef WITH_EMBREE_GPU
65 use_hardware_raytracing ? (
void *)&embree_device :
nullptr,
69 &is_several_intel_dgpu_devices_detected);
71 if (is_finished_ok ==
false) {
72 set_error(
"oneAPI queue initialization error: got runtime exception \"" +
73 oneapi_error_string_ +
"\"");
76 VLOG_DEBUG <<
"oneAPI queue has been successfully created for the device \""
// Hardware ray tracing requires the Embree device to have been created.
81# ifdef WITH_EMBREE_GPU
82 use_hardware_raytracing = use_hardware_raytracing && (embree_device !=
nullptr);
84 use_hardware_raytracing =
false;
87 if (use_hardware_raytracing) {
88 VLOG_INFO <<
"oneAPI will use hardware ray tracing for intersection acceleration.";
// Allocate the host-side kernel-globals segment (zeroed) and its device mirror.
91 size_t globals_segment_size;
92 is_finished_ok = kernel_globals_size(globals_segment_size);
93 if (is_finished_ok ==
false) {
94 set_error(
"oneAPI constant memory initialization got runtime exception \"" +
95 oneapi_error_string_ +
"\"");
98 VLOG_DEBUG <<
"Successfully created global/constant memory segment (kernel globals object)";
101 kg_memory_ = usm_aligned_alloc_host(device_queue_, globals_segment_size, 16);
102 usm_memset(device_queue_, kg_memory_, 0, globals_segment_size);
104 kg_memory_device_ = usm_alloc_device(device_queue_, globals_segment_size);
106 kg_memory_size_ = globals_segment_size;
108 max_memory_on_device_ = get_memcapacity();
// Allow overriding the memory headroom through an environment variable.
112 const char *headroom_str = getenv(
"CYCLES_ONEAPI_MEMORY_HEADROOM");
113 if (headroom_str !=
nullptr) {
114 const long long override_headroom = (float)atoll(headroom_str);
115 device_working_headroom = override_headroom;
116 device_texture_headroom = override_headroom;
// Destructor: releases resources in reverse order of construction —
// Embree device (if built), kernel-globals host and device buffers,
// cached constant-memory map, then the SYCL queue itself.
122OneapiDevice::~OneapiDevice()
124# ifdef WITH_EMBREE_GPU
126 rtcReleaseDevice(embree_device);
131 usm_free(device_queue_, kg_memory_);
132 usm_free(device_queue_, kg_memory_device_);
134 const_mem_map_.clear();
137 free_queue(device_queue_);
// Peer access between devices is not used by this backend (parameter unused).
141bool OneapiDevice::check_peer_access(
Device * )
// Whether the Embree GPU path supports the requested kernel feature set.
// For Embree < 4.1 some features are gated off; newer versions accept all
// (requested_features intentionally unused in that branch).
146bool OneapiDevice::can_use_hardware_raytracing_for_features(
const uint requested_features)
const
150# if defined(RTC_VERSION) && RTC_VERSION < 40100
153 (void)requested_features;
// Choose the BVH layout: hardware ray tracing (Embree GPU) when available
// and feature-compatible, otherwise the fallback layout (elided branch).
158BVHLayoutMask OneapiDevice::get_bvh_layout_mask(
const uint requested_features)
const
160 return (use_hardware_raytracing &&
161 can_use_hardware_raytracing_for_features(requested_features)) ?
// Fragment of build_bvh (signature elided from this chunk): builds the
// Embree BVH on the embree_device and, for Embree >= 4.3.2, offloads the
// accumulated scenes to the GPU, reporting Embree errors via set_error().
166# ifdef WITH_EMBREE_GPU
170 BVHEmbree *
const bvh_embree =
static_cast<BVHEmbree *
>(bvh);
175 bvh_embree->build(
progress, &stats, embree_device,
true);
178# if RTC_VERSION >= 40302
180 all_embree_scenes.push_back(bvh_embree->scene);
// Embree >= 4.4 exposes a dedicated traversable handle; older versions use
// the scene pointer directly.
184# if RTC_VERSION >= 40400
185 embree_traversable = rtcGetSceneTraversable(bvh_embree->scene);
187 embree_traversable = bvh_embree->scene;
189# if RTC_VERSION >= 40302
190 RTCError error_code = bvh_embree->offload_scenes_to_gpu(all_embree_scenes);
191 if (error_code != RTC_ERROR_NONE) {
193 string_printf(
"BVH failed to migrate to the GPU due to Embree library error (%s)",
194 bvh_embree->get_error_string(error_code)));
196 all_embree_scenes.clear();
// Returns an estimate of free device memory. Prefers the accurate
// ext_intel_free_memory query on discrete GPUs; for integrated GPUs (or when
// the aspect is missing) falls back to the device's own usage bookkeeping.
206size_t OneapiDevice::get_free_mem()
const
212 const sycl::device &device =
reinterpret_cast<sycl::queue *
>(device_queue_)->get_device();
213 const bool is_integrated_gpu = device.get_info<sycl::info::device::host_unified_memory>();
214 if (device.has(sycl::aspect::ext_intel_free_memory) && is_integrated_gpu ==
false) {
215 return device.get_info<sycl::ext::intel::info::device::free_memory>();
218 if (device_mem_in_use < max_memory_on_device_) {
219 return max_memory_on_device_ - device_mem_in_use;
// Loads (JIT-compiles) the oneAPI kernels for the requested feature set.
// Runs a tiny test kernel first as a smoke check, disables hardware ray
// tracing if the features are unsupported, then reserves private memory.
// Returns true on success; errors are reported through set_error().
224bool OneapiDevice::load_kernels(
const uint requested_features)
234 kernel_features |= requested_features;
236 bool is_finished_ok = oneapi_run_test_kernel(device_queue_);
237 if (is_finished_ok ==
false) {
238 set_error(
"oneAPI test kernel execution: got a runtime exception \"" + oneapi_error_string_ +
245 if (use_hardware_raytracing && !can_use_hardware_raytracing_for_features(requested_features)) {
247 <<
"Hardware ray tracing disabled, not supported yet by oneAPI for requested features.";
248 use_hardware_raytracing =
false;
251 is_finished_ok = oneapi_load_kernels(
252 device_queue_, (
const unsigned int)requested_features, use_hardware_raytracing);
253 if (is_finished_ok ==
false) {
254 set_error(
"oneAPI kernels loading: got a runtime exception \"" + oneapi_error_string_ +
"\"");
// Pre-reserve per-thread private memory so later allocations see a
// realistic free-memory figure.
260 if (is_finished_ok) {
261 reserve_private_memory(requested_features);
262 is_finished_ok = !have_error();
265 return is_finished_ok;
// Forces the driver to reserve per-thread private/scratch memory by running a
// representative kernel once; the free-memory delta is logged afterwards.
// (Kernel selection lines are elided from this chunk.)
268void OneapiDevice::reserve_private_memory(
const uint kernel_features)
270 size_t free_before = get_free_mem();
287 queue->init_execution();
291 queue->enqueue(test_kernel, 1, args);
292 queue->synchronize();
295 size_t free_after = get_free_mem();
297 VLOG_INFO <<
"For kernel execution were reserved "
// Reports free and total device memory to the caller.
302void OneapiDevice::get_device_memory_info(
size_t &total,
size_t &
free)
304 free = get_free_mem();
305 total = max_memory_on_device_;
// Allocates device USM memory and zero-initializes it. On a zeroing failure
// the allocation is rolled back (freed, pointer nulled) so the caller never
// receives uninitialized device memory. Returns true on success.
308bool OneapiDevice::alloc_device(
void *&device_pointer,
const size_t size)
310 bool allocation_success =
false;
311 device_pointer = usm_alloc_device(device_queue_,
size);
312 if (device_pointer !=
nullptr) {
313 allocation_success =
true;
316 if (!oneapi_zero_memory_on_device(device_queue_, device_pointer,
size)) {
317 set_error(
"oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
319 usm_free(device_queue_, device_pointer);
321 device_pointer =
nullptr;
322 allocation_success =
false;
326 return allocation_success;
// Frees device USM memory allocated by alloc_device().
329void OneapiDevice::free_device(
void *device_pointer)
331 usm_free(device_queue_, device_pointer);
// Allocates 64-byte-aligned host USM memory that the device can also access.
334bool OneapiDevice::shared_alloc(
void *&shared_pointer,
const size_t size)
336 shared_pointer = usm_aligned_alloc_host(device_queue_,
size, 64);
337 return shared_pointer !=
nullptr;
340void OneapiDevice::shared_free(
void *shared_pointer)
342 usm_free(device_queue_, shared_pointer);
// Host USM pointers are directly usable on the device, so the "conversion"
// is just dropping const-ness of the same address.
345void *OneapiDevice::shared_to_device_pointer(
const void *shared_pointer)
349 return const_cast<void *
>(shared_pointer);
// Synchronous host-to-device copy via the device queue.
352void OneapiDevice::copy_host_to_device(
void *device_pointer,
void *host_pointer,
const size_t size)
354 usm_memcpy(device_queue_, device_pointer, host_pointer,
size);
// Simple accessors below.
358SyclQueue *OneapiDevice::sycl_queue()
360 return device_queue_;
363string OneapiDevice::oneapi_error_message()
365 return string(oneapi_error_string_);
368int OneapiDevice::scene_max_shaders()
370 return scene_max_shaders_;
373void *OneapiDevice::kernel_globals_device_pointer()
375 return kg_memory_device_;
// Host allocation with an optional Level-Zero fast-copy optimization:
// the buffer is registered via prepare_for_device_copy so later transfers
// avoid an extra staging step. Skipped when multiple Intel dGPUs are present
// (presumably the registration is tied to a single device context — the
// elided lines would confirm). Allocation itself is elided from this chunk.
378void *OneapiDevice::host_alloc(
const MemoryType type,
const size_t size)
382# ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
385 if (is_several_intel_dgpu_devices_detected ==
false && host_pointer) {
388 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(device_queue_);
391 if (queue->get_backend() == sycl::backend::ext_oneapi_level_zero) {
392 sycl::ext::oneapi::experimental::prepare_for_device_copy(host_pointer,
size, *queue);
// Counterpart of host_alloc: unregisters the buffer before freeing it.
401void OneapiDevice::host_free(
const MemoryType type,
void *host_pointer,
const size_t size)
403# ifdef SYCL_EXT_ONEAPI_COPY_OPTIMIZE
404 if (is_several_intel_dgpu_devices_detected ==
false) {
406 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(device_queue_);
409 if (queue->get_backend() == sycl::backend::ext_oneapi_level_zero) {
410 sycl::ext::oneapi::experimental::release_from_device_copy(host_pointer, *queue);
// Fragments of the device_memory entry points (mem_alloc / mem_copy_to /
// mem_move_to_host / mem_copy_from / mem_zero). Texture and global memory
// types must go through their dedicated tex_*/global_* paths, hence the
// asserts. Many dispatch lines are elided from this chunk.
422 assert(!
"mem_alloc not supported for textures.");
425 assert(!
"mem_alloc not supported for global memory.");
440 VLOG_DEBUG <<
"OneapiDevice::mem_copy_to: \"" << mem.
name <<
"\", "
461 generic_copy_to(mem);
468 VLOG_DEBUG <<
"OneapiDevice::mem_move_to_host: \"" << mem.
name <<
"\", "
// Copies a (y, h) row window of `mem` from device back to host.
// A zero (w, h, elem) triple means "copy the whole buffer".
492void OneapiDevice::mem_copy_from(
493 device_memory &mem,
const size_t y,
size_t w,
const size_t h,
size_t elem)
496 assert(!
"mem_copy_from not supported for textures.");
499 const size_t size = (
w > 0 || h > 0 || elem > 0) ? (elem *
w * h) : mem.
memory_size();
500 const size_t offset = elem *
y *
w;
503 VLOG_DEBUG <<
"OneapiDevice::mem_copy_from: \"" << mem.
name <<
"\" object of "
506 <<
" data " <<
size <<
" bytes";
// Same byte offset applies on both sides of the copy.
519 char *shifted_host =
reinterpret_cast<char *
>(mem.
host_pointer) + offset;
520 char *shifted_device =
reinterpret_cast<char *
>(mem.
device_pointer) + offset;
521 bool is_finished_ok = usm_memcpy(device_queue_, shifted_host, shifted_device,
size);
522 if (is_finished_ok ==
false) {
523 set_error(
"oneAPI memory operation error: got runtime exception \"" +
524 oneapi_error_string_ +
"\"");
// mem_zero fragment: clears device memory through usm_memset.
552 bool is_finished_ok = usm_memset(
554 if (is_finished_ok ==
false) {
555 set_error(
"oneAPI memory operation error: got runtime exception \"" + oneapi_error_string_ +
// Copies named constant memory into the kernel-globals segment. The "data"
// name is special-cased: the host-side KernelData is patched (scene shader
// count cached; Embree traversable injected for hardware ray tracing) before
// upload. Other names are cached in const_mem_map_ as device buffers, and the
// kernel-globals pointer table is updated and mirrored to the device copy.
587void OneapiDevice::const_copy_to(
const char *name,
void *host,
const size_t size)
591 VLOG_DEBUG <<
"OneapiDevice::const_copy_to \"" << name <<
"\" object "
595 if (strcmp(name,
"data") == 0) {
597 KernelData *
const data =
static_cast<KernelData *
>(host);
601 scene_max_shaders_ =
data->max_shaders;
603# ifdef WITH_EMBREE_GPU
604 if (embree_traversable !=
nullptr) {
608 data->device_bvh = embree_traversable;
// Find or create the cached device buffer for this constant name.
613 ConstMemMap::iterator
i = const_mem_map_.find(name);
616 if (
i == const_mem_map_.end()) {
619 data_ptr->alloc(
size);
620 data = data_ptr.get();
621 const_mem_map_.insert(ConstMemMap::value_type(name, std::move(data_ptr)));
624 data =
i->second.get();
629 data->copy_to_device();
// Patch the kernel-globals table with the device pointer, then mirror the
// whole host-side globals segment to its device copy.
631 set_global_memory(device_queue_, kg_memory_, name, (
void *)
data->device_pointer);
633 usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
// global_alloc / global_copy_to fragments: upload and re-mirror globals.
641 VLOG_DEBUG <<
"OneapiDevice::global_alloc \"" << mem.
name <<
"\" object "
646 generic_copy_to(mem);
650 usm_memcpy(device_queue_, kg_memory_device_, kg_memory_, kg_memory_size_);
659 generic_copy_to(mem);
// Builds a bindless-image descriptor matching a Cycles device_texture:
// channel type is picked from the texture's data type (selector lines elided).
670static sycl::ext::oneapi::experimental::image_descriptor image_desc(
const device_texture &mem)
673 sycl::image_channel_type channel_type;
677 channel_type = sycl::image_channel_type::unorm_int8;
680 channel_type = sycl::image_channel_type::unorm_int16;
683 channel_type = sycl::image_channel_type::fp32;
686 channel_type = sycl::image_channel_type::fp16;
692 sycl::ext::oneapi::experimental::image_descriptor param;
697 param.channel_type = channel_type;
// tex_alloc region: translate Cycles sampler settings to SYCL equivalents.
710 sycl::addressing_mode address_mode = sycl::addressing_mode::none;
713 address_mode = sycl::addressing_mode::repeat;
716 address_mode = sycl::addressing_mode::clamp_to_edge;
719 address_mode = sycl::addressing_mode::clamp;
722 address_mode = sycl::addressing_mode::mirrored_repeat;
729 sycl::filtering_mode filter_mode;
731 filter_mode = sycl::filtering_mode::nearest;
734 filter_mode = sycl::filtering_mode::linear;
738 sycl::image_channel_type channel_type;
742 channel_type = sycl::image_channel_type::unorm_int8;
745 channel_type = sycl::image_channel_type::unorm_int16;
748 channel_type = sycl::image_channel_type::fp32;
751 channel_type = sycl::image_channel_type::fp16;
758 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(device_queue_);
762 sycl::ext::oneapi::experimental::image_mem_handle memHandle{0};
763 sycl::ext::oneapi::experimental::image_descriptor desc{};
// Validate texture dimensions against the device's 3D/2D image limits
// before attempting the allocation.
766 const sycl::device &device =
reinterpret_cast<sycl::queue *
>(queue)->get_device();
768 const size_t max_width = device.get_info<sycl::info::device::image3d_max_width>();
769 const size_t max_height = device.get_info<sycl::info::device::image3d_max_height>();
770 const size_t max_depth = device.get_info<sycl::info::device::image3d_max_depth>();
776 "Maximum GPU 3D texture size exceeded (max %zux%zux%zu, found %zux%zux%zu)",
787 const size_t max_width = device.get_info<sycl::info::device::image2d_max_width>();
788 const size_t max_height = device.get_info<sycl::info::device::image2d_max_height>();
792 string_printf(
"Maximum GPU 2D texture size exceeded (max %zux%zu, found %zux%zu)",
803 desc = sycl::ext::oneapi::experimental::image_descriptor(
// Allocate the bindless image memory and upload the host pixels.
810 sycl::ext::oneapi::experimental::image_mem_handle memHandle =
811 sycl::ext::oneapi::experimental::alloc_image_mem(desc, *queue);
812 if (!memHandle.raw_handle) {
813 set_error(
"GPU texture allocation failed: Raw handle is null");
818 queue->ext_oneapi_copy(mem.
host_pointer, memHandle, desc);
825 cmem = &device_mem_map[&mem];
827 cmem->array = (arrayMemObject)(memHandle.raw_handle);
// Fallback path: plain (non-image) allocation through generic_alloc.
831 desc = sycl::ext::oneapi::experimental::image_descriptor(
833 cmem = generic_alloc(mem);
841 queue->wait_and_throw();
// Create the sampled-image handle combining the sampler state with either
// the image memory or the generic allocation.
846 sycl::ext::oneapi::experimental::bindless_image_sampler samp(
847 address_mode, sycl::coordinate_normalization_mode::normalized, filter_mode);
854 sycl::ext::oneapi::experimental::sampled_image_handle imgHandle;
856 if (memHandle.raw_handle) {
858 imgHandle = sycl::ext::oneapi::experimental::create_image(memHandle, samp, desc, *queue);
862 imgHandle = sycl::ext::oneapi::experimental::create_image(
867 cmem = &device_mem_map[&mem];
868 cmem->texobject = (texMemObject)(imgHandle.raw_handle);
// Publish the texture info in its slot; grow the table in chunks of 128.
880 if (slot >= texture_info.size()) {
882 texture_info.resize(slot + 128);
884 texture_info[slot] = tex_info;
885 need_texture_info =
true;
888 catch (sycl::exception
const &
e) {
889 set_error(
"GPU texture allocation failed: runtime exception \"" +
string(
e.what()) +
"\"");
// tex_copy_to fragment: re-uploads host pixels into the existing bindless
// image (image path) or falls back to a generic copy; debug builds wait to
// surface asynchronous errors immediately.
901 sycl::ext::oneapi::experimental::image_descriptor desc = image_desc(mem);
903 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(device_queue_);
908 const Mem &cmem = device_mem_map[&mem];
909 sycl::ext::oneapi::experimental::image_mem_handle image_handle{
910 (sycl::ext::oneapi::experimental::image_mem_handle::raw_handle_type)cmem.array};
911 queue->ext_oneapi_copy(mem.
host_pointer, image_handle, desc);
913# ifdef WITH_CYCLES_DEBUG
914 queue->wait_and_throw();
917 catch (sycl::exception
const &
e) {
918 set_error(
"oneAPI texture copy error: got runtime exception \"" +
string(
e.what()) +
"\"");
922 generic_copy_to(mem);
// tex_free fragment: destroys the sampled-image handle first, then releases
// the underlying image memory, and finally drops the bookkeeping entry.
931 DCHECK(device_mem_map.find(&mem) != device_mem_map.end());
932 const Mem &cmem = device_mem_map[&mem];
934 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(device_queue_);
936 if (cmem.texobject) {
938 sycl::ext::oneapi::experimental::sampled_image_handle image(cmem.texobject);
939 sycl::ext::oneapi::experimental::destroy_image_handle(image, *queue);
944 sycl::ext::oneapi::experimental::image_mem_handle imgHandle{
945 (sycl::ext::oneapi::experimental::image_mem_handle::raw_handle_type)cmem.array};
949 sycl::ext::oneapi::experimental::free_image_mem(
950 imgHandle, sycl::ext::oneapi::experimental::image_type::standard, *queue);
952 catch (sycl::exception
const &
e) {
953 set_error(
"oneAPI texture deallocation error: got runtime exception \"" +
954 string(
e.what()) +
"\"");
960 device_mem_map.erase(device_mem_map.find(&mem));
// gpu_queue_create fragment: hands out a new OneapiDeviceQueue bound to
// this device.
971 return make_unique<OneapiDeviceQueue>(
this);
// Thin instance-level wrappers that forward to the static, queue-taking
// overloads using the device's own queue.
982void *OneapiDevice::usm_aligned_alloc_host(
const size_t memory_size,
const size_t alignment)
985 return usm_aligned_alloc_host(device_queue_, memory_size, alignment);
988void OneapiDevice::usm_free(
void *usm_ptr)
991 usm_free(device_queue_, usm_ptr);
// Debug sanity check: asserts that a USM pointer's allocation kind matches
// what this build expects (device USM normally; host USM under the
// WITH_ONEAPI_SYCL_HOST_TASK debug build). Host pointers are also accepted
// when allow_host is set or the device is a CPU; "unknown" is tolerated
// (e.g. offset pointers). NOTE(review): a default argument on a definition
// is non-standard placement — it normally belongs on the declaration;
// confirm against the header before changing.
994void OneapiDevice::check_usm(SyclQueue *queue_,
const void *usm_ptr,
bool allow_host =
false)
997 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(queue_);
998 sycl::info::device_type device_type =
999 queue->get_device().get_info<sycl::info::device::device_type>();
1000 sycl::usm::alloc usm_type =
get_pointer_type(usm_ptr, queue->get_context());
1002# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1003 const sycl::usm::alloc main_memory_type = sycl::usm::alloc::device;
1005 const sycl::usm::alloc main_memory_type = sycl::usm::alloc::host;
1007 assert(usm_type == main_memory_type ||
1008 (usm_type == sycl::usm::alloc::host &&
1009 (allow_host || device_type == sycl::info::device_type::cpu)) ||
1010 usm_type == sycl::usm::alloc::unknown);
// Creates the in-order SYCL queue for the device at device_index and reports
// whether several Intel dGPUs were detected. With multiple dGPUs a dedicated
// context is created for the queue (instead of the shared default context).
// Optionally creates the Embree SYCL device on the same context; on failure
// a user-facing hint about the ray-tracing driver package is recorded.
// Returns false and fills oneapi_error_string_ on any SYCL exception.
1019bool OneapiDevice::create_queue(SyclQueue *&external_queue,
1020 const int device_index,
1021 void *embree_device_pointer,
1022 bool *is_several_intel_dgpu_devices_detected_pointer)
1024 bool finished_correct =
true;
1025 *is_several_intel_dgpu_devices_detected_pointer =
false;
1028 std::vector<sycl::device> devices = available_sycl_devices(
1029 is_several_intel_dgpu_devices_detected_pointer);
1030 if (device_index < 0 || device_index >= devices.size()) {
1034 sycl::queue *created_queue =
nullptr;
1035 if (*is_several_intel_dgpu_devices_detected_pointer ==
false) {
1036 created_queue =
new sycl::queue(devices[device_index], sycl::property::queue::in_order());
1039 sycl::context device_context(devices[device_index]);
1040 created_queue =
new sycl::queue(
1041 device_context, devices[device_index], sycl::property::queue::in_order());
1042 VLOG_DEBUG <<
"Separate context was generated for the new queue, as several available SYCL "
1043 "devices were detected";
1045 external_queue =
reinterpret_cast<SyclQueue *
>(created_queue);
1047# ifdef WITH_EMBREE_GPU
1048 if (embree_device_pointer) {
1049 RTCDevice *device_object_ptr =
reinterpret_cast<RTCDevice *
>(embree_device_pointer);
1050 *device_object_ptr = rtcNewSYCLDevice(created_queue->get_context(),
"");
1051 if (*device_object_ptr ==
nullptr) {
1052 finished_correct =
false;
1053 oneapi_error_string_ =
1054 "Hardware Raytracing is not available; please install "
1055 "\"intel-level-zero-gpu-raytracing\" to enable it or disable Embree on GPU.";
1059 (void)embree_device_pointer;
1062 catch (
const sycl::exception &
e) {
1063 finished_correct =
false;
1064 oneapi_error_string_ =
e.what();
1066 return finished_correct;
// Destroys a queue created by create_queue (deletion line elided).
1069void OneapiDevice::free_queue(SyclQueue *queue_)
1072 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(queue_);
// Static helper: aligned host USM allocation on the given queue's context.
1076void *OneapiDevice::usm_aligned_alloc_host(SyclQueue *queue_,
1078 const size_t alignment)
1081 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(queue_);
1082 return sycl::aligned_alloc_host(alignment, memory_size, *queue);
// Static helper: device USM allocation; under the SYCL-host-task debug
// build, host memory stands in for device memory.
1085void *OneapiDevice::usm_alloc_device(SyclQueue *queue_,
size_t memory_size)
1088 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(queue_);
1099# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1100 return sycl::malloc_device(memory_size, *queue);
1102 return sycl::malloc_host(memory_size, *queue);
// Static helper: validates the pointer's USM kind, then frees it.
1106void OneapiDevice::usm_free(SyclQueue *queue_,
void *usm_ptr)
1109 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(queue_);
1110 OneapiDevice::check_usm(queue_, usm_ptr,
true);
1111 sycl::free(usm_ptr, *queue);
// Copies num_bytes between USM pointers. Host↔host (or unknown-kind, e.g.
// offset) pairs use a plain memcpy; otherwise the queue's async memcpy is
// used and the copy is synchronized when the result must be immediately
// visible (device→host reads, or offset pointers whose kind is unknown).
// Returns false and records oneapi_error_string_ on a SYCL exception.
1114bool OneapiDevice::usm_memcpy(SyclQueue *queue_,
void *dest,
void *src,
const size_t num_bytes)
1124 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(queue_);
1125 OneapiDevice::check_usm(queue_, dest,
true);
1126 OneapiDevice::check_usm(queue_, src,
true);
1131 if ((dest_type == sycl::usm::alloc::host || dest_type == sycl::usm::alloc::unknown) &&
1132 (src_type == sycl::usm::alloc::host || src_type == sycl::usm::alloc::unknown))
1134 memcpy(dest, src, num_bytes);
1139 sycl::event mem_event = queue->memcpy(dest, src, num_bytes);
// Debug builds always wait so errors surface at the faulting call site.
1140# ifdef WITH_CYCLES_DEBUG
1144 mem_event.wait_and_throw();
1147 bool from_device_to_host = dest_type == sycl::usm::alloc::host &&
1148 src_type == sycl::usm::alloc::device;
1149 bool host_or_device_memop_with_offset = dest_type == sycl::usm::alloc::unknown ||
1150 src_type == sycl::usm::alloc::unknown;
1154 if (from_device_to_host || host_or_device_memop_with_offset) {
1160 catch (
const sycl::exception &
e) {
1161 oneapi_error_string_ =
e.what();
// Fills num_bytes at usm_ptr with `value` via the queue's async memset;
// debug builds synchronize. Same error-reporting convention as usm_memcpy.
1166bool OneapiDevice::usm_memset(SyclQueue *queue_,
1168 unsigned char value,
1169 const size_t num_bytes)
1179 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(queue_);
1180 OneapiDevice::check_usm(queue_, usm_ptr,
true);
1182 sycl::event mem_event = queue->memset(usm_ptr, value, num_bytes);
1183# ifdef WITH_CYCLES_DEBUG
1187 mem_event.wait_and_throw();
1193 catch (
const sycl::exception &
e) {
1194 oneapi_error_string_ =
e.what();
// Blocks until all queued work finishes; on a SYCL exception records the
// message and (per the surrounding convention) reports failure.
1199bool OneapiDevice::queue_synchronize(SyclQueue *queue_)
1202 sycl::queue *queue =
reinterpret_cast<sycl::queue *
>(queue_);
1204 queue->wait_and_throw();
1207 catch (
const sycl::exception &
e) {
1208 oneapi_error_string_ =
e.what();
// Reports the size of the KernelGlobals segment (body elided).
1213bool OneapiDevice::kernel_globals_size(
size_t &kernel_global_size)
// Patches one named pointer slot inside the host-side KernelGlobals block.
// The name→member dispatch is generated by X-macro expansion of
// kernel/data_arrays.h; unmatched names are reported to stderr.
1220void OneapiDevice::set_global_memory(SyclQueue *queue_,
1221 void *kernel_globals,
1222 const char *memory_name,
1223 void *memory_device_pointer)
1228 assert(memory_device_pointer);
1230 OneapiDevice::check_usm(queue_, memory_device_pointer,
true);
1231 OneapiDevice::check_usm(queue_, kernel_globals,
true);
1233 std::string matched_name(memory_name);
// Each KERNEL_DATA_ARRAY expands to an `else if` matching one array name.
1236# define KERNEL_DATA_ARRAY(type, name) \
1237 else if (#name == matched_name) { \
1238 globals->__##name = (type *)memory_device_pointer; \
1243 else if (
"integrator_state" == matched_name) {
1248# include "kernel/data_arrays.h"
1250 std::cerr <<
"Can't found global/constant memory with name \"" << matched_name <<
"\"!"
1254# undef KERNEL_DATA_ARRAY
// Thin forwarder into the precompiled kernel dispatch entry point.
1257bool OneapiDevice::enqueue_kernel(KernelContext *kernel_context,
1259 const size_t global_size,
1260 const size_t local_size,
1263 return oneapi_enqueue_kernel(kernel_context,
1268 use_hardware_raytracing,
// Chooses a work-group (local) size per kernel category — tuned constants
// for intersection, shading (with a smaller size on SIMD8 hardware),
// cryptomatte and shader-evaluation kernels — clamped to the device maximum,
// then rounds the global size up to a multiple of the local size.
1272void OneapiDevice::get_adjusted_global_and_local_sizes(SyclQueue *queue,
1274 size_t &kernel_global_size,
1275 size_t &kernel_local_size)
1278 static const size_t preferred_work_group_size_intersect = 128;
1279 static const size_t preferred_work_group_size_shading = 256;
1280 static const size_t preferred_work_group_size_shading_simd8 = 64;
1283 static const size_t preferred_work_group_size_shader_evaluation = 256;
1286 static const size_t preferred_work_group_size_cryptomatte = 512;
1287 static const size_t preferred_work_group_size_default = 1024;
1289 const sycl::device &device =
reinterpret_cast<sycl::queue *
>(queue)->get_device();
1290 const size_t max_work_group_size = device.get_info<sycl::info::device::max_work_group_size>();
1292 size_t preferred_work_group_size = 0;
1301 preferred_work_group_size = preferred_work_group_size_intersect;
1312 const bool device_is_simd8 =
1313 (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
1314 device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() == 8);
1315 preferred_work_group_size = (device_is_simd8) ? preferred_work_group_size_shading_simd8 :
1316 preferred_work_group_size_shading;
1320 preferred_work_group_size = preferred_work_group_size_cryptomatte;
1326 preferred_work_group_size = preferred_work_group_size_shader_evaluation;
// Fall back to the compiler-suggested size, then to the default.
1336 if (preferred_work_group_size == 0) {
1337 preferred_work_group_size = oneapi_suggested_gpu_kernel_size((
::DeviceKernel)kernel);
1341 if (preferred_work_group_size == 0) {
1342 preferred_work_group_size = preferred_work_group_size_default;
1345 kernel_local_size = std::min(max_work_group_size, preferred_work_group_size);
1349 kernel_global_size =
round_up(kernel_global_size, kernel_local_size);
// The host-task debug build runs everything as a single work item.
1351# ifdef WITH_ONEAPI_SYCL_HOST_TASK
1362 kernel_global_size = 1;
1363 kernel_local_size = 1;
1367 assert(kernel_global_size % kernel_local_size == 0);
// Minimum supported Intel GPU driver build numbers (the two *_neo values are
// alternatives under elided preprocessor conditions).
1372static const int lowest_supported_driver_version_win = 1016554;
1376static const int lowest_supported_driver_version_neo = 31896;
1378static const int lowest_supported_driver_version_neo = 31740;
// Extracts a comparable build number from the driver version string.
// Recognized layouts (see the warning text below): Windows xx.xx.xxx.xxxx
// (3rd*10000 + 4th), Linux/L0 variants with the build after the second dot.
// Returns 0 when the string does not match any known pattern.
1381int parse_driver_build_version(
const sycl::device &device)
1383 const std::string &driver_version = device.get_info<sycl::info::device::driver_version>();
1384 int driver_build_version = 0;
1386 size_t second_dot_position = driver_version.find(
'.', driver_version.find(
'.') + 1);
1387 if (second_dot_position != std::string::npos) {
1389 size_t third_dot_position = driver_version.find(
'.', second_dot_position + 1);
1390 if (third_dot_position != std::string::npos) {
1391 const std::string &third_number_substr = driver_version.substr(
1392 second_dot_position + 1, third_dot_position - second_dot_position - 1);
1393 const std::string &forth_number_substr = driver_version.substr(third_dot_position + 1);
1394 if (third_number_substr.length() == 3 && forth_number_substr.length() == 4) {
1395 driver_build_version = std::stoi(third_number_substr) * 10000 +
1396 std::stoi(forth_number_substr);
1402 else if (third_number_substr.length() == 5 && forth_number_substr.length() == 6) {
1403 driver_build_version = std::stoi(third_number_substr);
1407 const std::string &third_number_substr = driver_version.substr(second_dot_position + 1);
1408 driver_build_version = std::stoi(third_number_substr);
// Non-numeric fields fall through with driver_build_version left at 0.
1411 catch (std::invalid_argument &) {
1415 if (driver_build_version == 0) {
1416 VLOG_WARNING <<
"Unable to parse unknown Intel GPU driver version. \"" << driver_version
1417 <<
"\" does not match xx.xx.xxxxx (Linux), x.x.xxxx (L0),"
1418 <<
" xx.xx.xxx.xxxx (Windows) for device \""
1419 << device.get_info<sycl::info::device::name>() <<
"\".";
1422 return driver_build_version;
// Enumerates SYCL devices usable by Cycles. By default only Intel Level-Zero
// GPUs that pass size and driver-version checks are returned; setting
// CYCLES_ONEAPI_ALL_DEVICES accepts everything. Also counts Level-Zero
// discrete GPUs so callers can detect multi-dGPU configurations.
1425std::vector<sycl::device> available_sycl_devices(
bool *multiple_dgpus_detected =
nullptr)
1427 std::vector<sycl::device> available_devices;
1428 bool allow_all_devices =
false;
1429 if (getenv(
"CYCLES_ONEAPI_ALL_DEVICES") !=
nullptr) {
1430 allow_all_devices =
true;
1433 int level_zero_dgpu_counter = 0;
1435 const std::vector<sycl::platform> &oneapi_platforms = sycl::platform::get_platforms();
1437 for (
const sycl::platform &platform : oneapi_platforms) {
// Skip the OpenCL backend — the same hardware shows up again via Level-Zero.
1441 if (platform.get_backend() == sycl::backend::opencl) {
1445 const std::vector<sycl::device> &oneapi_devices =
1446 (allow_all_devices) ? platform.get_devices(sycl::info::device_type::all) :
1447 platform.get_devices(sycl::info::device_type::gpu);
1449 for (
const sycl::device &device : oneapi_devices) {
1450 bool filter_out =
false;
// host_unified_memory == false identifies a discrete GPU.
1452 if (platform.get_backend() == sycl::backend::ext_oneapi_level_zero && device.is_gpu() &&
1453 device.get_info<sycl::info::device::host_unified_memory>() ==
false
1456 level_zero_dgpu_counter++;
1459 if (!allow_all_devices) {
1463 if (!device.is_gpu() || platform.get_backend() != sycl::backend::ext_oneapi_level_zero) {
// Heuristic minimum-size filter: defaults match small integrated parts,
// which are excluded (elided branch presumably sets filter_out — confirm).
1468 int number_of_eus = 96;
1469 int threads_per_eu = 7;
1470 if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
1471 number_of_eus = device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
1473 if (device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
1475 device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
1478 if (number_of_eus <= 96 && threads_per_eu == 7) {
// Only Intel devices get the driver-version gate below.
1482 bool check_driver_version = !filter_out;
1484 if (check_driver_version &&
1485 device.get_info<sycl::info::device::vendor>().find(
"Intel") == std::string::npos)
1487 check_driver_version =
false;
1495# if __LIBSYCL_MAJOR_VERSION < 8
1496 if (check_driver_version &&
1497 !
string_startswith(device.get_info<sycl::info::device::driver_version>(),
"1.3."))
1499 check_driver_version =
false;
1502 if (check_driver_version) {
// Windows build numbers are > 100000, which selects the right threshold.
1503 int driver_build_version = parse_driver_build_version(device);
1504 const int lowest_supported_driver_version = (driver_build_version > 100000) ?
1505 lowest_supported_driver_version_win :
1506 lowest_supported_driver_version_neo;
1507 if (driver_build_version < lowest_supported_driver_version) {
1511 << device.get_info<sycl::info::device::name>()
1512 <<
"\" is too old. Expected \"" << lowest_supported_driver_version
1513 <<
"\" or newer, but got \"" << driver_build_version <<
"\".";
1519 available_devices.push_back(device);
1524 catch (sycl::exception &
e) {
1525 VLOG_WARNING <<
"An error has been encountered while enumerating SYCL devices: " <<
e.what();
1528 if (multiple_dgpus_detected) {
1529 *multiple_dgpus_detected = level_zero_dgpu_counter > 1;
1532 return available_devices;
// Maps the SYCL device-architecture enum to a human-readable name and a flag
// saying whether Cycles kernels are tuned for it (DG2/Meteor Lake/Battlemage/
// Lunar Lake are marked optimized; older integrated parts, DG1 and PVC are
// not). Unknown architectures fall through to is_optimized = false.
1535void OneapiDevice::architecture_information(
const SyclDevice *device,
1539 const sycl::ext::oneapi::experimental::architecture arch =
1540 reinterpret_cast<const sycl::device *
>(device)
1541 ->get_info<sycl::ext::oneapi::experimental::info::device::architecture>();
// One switch case per architecture, generated by this helper macro.
1543# define FILL_ARCH_INFO(architecture_code, is_arch_optimised) \
1544 case sycl::ext::oneapi::experimental::architecture ::architecture_code: \
1545 name = #architecture_code; \
1546 is_optimized = is_arch_optimised; \
1558 FILL_ARCH_INFO(intel_gpu_bdw,
false)
1559 FILL_ARCH_INFO(intel_gpu_skl,
false)
1560 FILL_ARCH_INFO(intel_gpu_kbl,
false)
1561 FILL_ARCH_INFO(intel_gpu_cfl,
false)
1562 FILL_ARCH_INFO(intel_gpu_apl,
false)
1563 FILL_ARCH_INFO(intel_gpu_glk,
false)
1564 FILL_ARCH_INFO(intel_gpu_whl,
false)
1565 FILL_ARCH_INFO(intel_gpu_aml,
false)
1566 FILL_ARCH_INFO(intel_gpu_cml,
false)
1567 FILL_ARCH_INFO(intel_gpu_icllp,
false)
1568 FILL_ARCH_INFO(intel_gpu_ehl,
false)
1569 FILL_ARCH_INFO(intel_gpu_tgllp,
false)
1570 FILL_ARCH_INFO(intel_gpu_rkl,
false)
1571 FILL_ARCH_INFO(intel_gpu_adl_s,
false)
1572 FILL_ARCH_INFO(intel_gpu_adl_p,
false)
1573 FILL_ARCH_INFO(intel_gpu_adl_n,
false)
1574 FILL_ARCH_INFO(intel_gpu_dg1,
false)
1575 FILL_ARCH_INFO(intel_gpu_dg2_g10,
true)
1576 FILL_ARCH_INFO(intel_gpu_dg2_g11,
true)
1577 FILL_ARCH_INFO(intel_gpu_dg2_g12,
true)
1578 FILL_ARCH_INFO(intel_gpu_pvc,
false)
1579 FILL_ARCH_INFO(intel_gpu_pvc_vg,
false)
1581 FILL_ARCH_INFO(intel_gpu_mtl_u,
true)
1582 FILL_ARCH_INFO(intel_gpu_mtl_h,
true)
1583 FILL_ARCH_INFO(intel_gpu_bmg_g21,
true)
1584 FILL_ARCH_INFO(intel_gpu_lnl_m,
true)
1588 is_optimized =
false;
1593char *OneapiDevice::device_capabilities()
1595 std::stringstream capabilities;
1597 const std::vector<sycl::device> &oneapi_devices = available_sycl_devices();
1598 for (
const sycl::device &device : oneapi_devices) {
1599# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1600 const std::string &name = device.get_info<sycl::info::device::name>();
1602 const std::string &name =
"SYCL Host Task (Debug)";
1605 capabilities << std::string(
"\t") << name <<
"\n";
1606 capabilities <<
"\t\tsycl::info::platform::name\t\t\t"
1607 << device.get_platform().get_info<sycl::info::platform::name>() <<
"\n";
1610 bool is_optimised_for_arch;
1611 architecture_information(
1612 reinterpret_cast<const SyclDevice *
>(&device), arch_name, is_optimised_for_arch);
1613 capabilities <<
"\t\tsycl::info::device::architecture\t\t\t";
1614 capabilities << arch_name <<
"\n";
1615 capabilities <<
"\t\tsycl::info::device::is_cycles_optimized\t\t\t";
1616 capabilities << is_optimised_for_arch <<
"\n";
1618# define WRITE_ATTR(attribute_name, attribute_variable) \
1619 capabilities << "\t\tsycl::info::device::" #attribute_name "\t\t\t" << attribute_variable \
1621# define GET_ATTR(attribute) \
1623 capabilities << "\t\tsycl::info::device::" #attribute "\t\t\t" \
1624 << device.get_info<sycl::info::device ::attribute>() << "\n"; \
1626# define GET_INTEL_ATTR(attribute) \
1628 if (device.has(sycl::aspect::ext_intel_##attribute)) { \
1629 capabilities << "\t\tsycl::ext::intel::info::device::" #attribute "\t\t\t" \
1630 << device.get_info<sycl::ext::intel::info::device ::attribute>() << "\n"; \
1633# define GET_ASPECT(aspect_) \
1635 capabilities << "\t\tdevice::has(" #aspect_ ")\t\t\t" << device.has(sycl::aspect ::aspect_) \
1640 GET_ATTR(driver_version)
1641 GET_ATTR(max_compute_units)
1642 GET_ATTR(max_clock_frequency)
1643 GET_ATTR(global_mem_size)
1644 GET_INTEL_ATTR(pci_address)
1645 GET_INTEL_ATTR(gpu_eu_simd_width)
1646 GET_INTEL_ATTR(gpu_eu_count)
1647 GET_INTEL_ATTR(gpu_slices)
1648 GET_INTEL_ATTR(gpu_subslices_per_slice)
1649 GET_INTEL_ATTR(gpu_eu_count_per_subslice)
1650 GET_INTEL_ATTR(gpu_hw_threads_per_eu)
1651 GET_INTEL_ATTR(max_mem_bandwidth)
1652 GET_ATTR(max_work_group_size)
1653 GET_ATTR(max_work_item_dimensions)
1654 sycl::id<3> max_work_item_sizes =
1655 device.get_info<sycl::info::device::max_work_item_sizes<3>>();
1656 WRITE_ATTR(max_work_item_sizes[0], max_work_item_sizes.get(0))
1657 WRITE_ATTR(max_work_item_sizes[1], max_work_item_sizes.get(1))
1658 WRITE_ATTR(max_work_item_sizes[2], max_work_item_sizes.get(2))
1660 GET_ATTR(max_num_sub_groups)
1661 for (
size_t sub_group_size : device.get_info<sycl::info::device::sub_group_sizes>()) {
1662 WRITE_ATTR(sub_group_size[], sub_group_size)
1664 GET_ATTR(sub_group_independent_forward_progress)
1666 GET_ATTR(preferred_vector_width_char)
1667 GET_ATTR(preferred_vector_width_short)
1668 GET_ATTR(preferred_vector_width_int)
1669 GET_ATTR(preferred_vector_width_long)
1670 GET_ATTR(preferred_vector_width_float)
1671 GET_ATTR(preferred_vector_width_double)
1672 GET_ATTR(preferred_vector_width_half)
1674 GET_ATTR(address_bits)
1675 GET_ATTR(max_mem_alloc_size)
1676 GET_ATTR(mem_base_addr_align)
1677 GET_ATTR(error_correction_support)
1678 GET_ATTR(is_available)
1679 GET_ATTR(host_unified_memory)
1684 GET_ASPECT(atomic64)
1685 GET_ASPECT(usm_host_allocations)
1686 GET_ASPECT(usm_device_allocations)
1687 GET_ASPECT(usm_shared_allocations)
1688 GET_ASPECT(usm_system_allocations)
1690# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_non_uniform_groups__
1691 GET_ASPECT(ext_oneapi_non_uniform_groups)
1693# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_bindless_images__
1694 GET_ASPECT(ext_oneapi_bindless_images)
1696# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_import__
1697 GET_ASPECT(ext_oneapi_interop_semaphore_import)
1699# ifdef __SYCL_ANY_DEVICE_HAS_ext_oneapi_interop_semaphore_export__
1700 GET_ASPECT(ext_oneapi_interop_semaphore_export)
1703# undef GET_INTEL_ATTR
1707 capabilities <<
"\n";
1710 return ::strdup(capabilities.str().c_str());
1713void OneapiDevice::iterate_devices(OneAPIDeviceIteratorCallback cb,
void *user_ptr)
1716 std::vector<sycl::device> devices = available_sycl_devices();
1717 for (sycl::device &device : devices) {
1718 const std::string &platform_name =
1719 device.get_platform().get_info<sycl::info::platform::name>();
1720# ifndef WITH_ONEAPI_SYCL_HOST_TASK
1721 std::string name = device.get_info<sycl::info::device::name>();
1723 std::string name =
"SYCL Host Task (Debug)";
1725# ifdef WITH_EMBREE_GPU
1726 bool hwrt_support = rtcIsSYCLDeviceSupported(device);
1728 bool hwrt_support =
false;
1730# if defined(WITH_OPENIMAGEDENOISE) && OIDN_VERSION >= 20300
1731 bool oidn_support = oidnIsSYCLDeviceSupported(&device);
1733 bool oidn_support =
false;
1735 std::string
id =
"ONEAPI_" + platform_name +
"_" + name;
1738 bool is_optimised_for_arch;
1739 architecture_information(
1740 reinterpret_cast<const SyclDevice *
>(&device), arch_name, is_optimised_for_arch);
1742 if (device.has(sycl::aspect::ext_intel_pci_address)) {
1743 id.append(
"_" + device.get_info<sycl::ext::intel::info::device::pci_address>());
1750 is_optimised_for_arch,
1756size_t OneapiDevice::get_memcapacity()
1758 return reinterpret_cast<sycl::queue *
>(device_queue_)
1760 .get_info<sycl::info::device::global_mem_size>();
1763int OneapiDevice::get_num_multiprocessors()
1765 const sycl::device &device =
reinterpret_cast<sycl::queue *
>(device_queue_)->get_device();
1766 if (device.has(sycl::aspect::ext_intel_gpu_eu_count)) {
1767 return device.get_info<sycl::ext::intel::info::device::gpu_eu_count>();
1769 return device.get_info<sycl::info::device::max_compute_units>();
1772int OneapiDevice::get_max_num_threads_per_multiprocessor()
1774 const sycl::device &device =
reinterpret_cast<sycl::queue *
>(device_queue_)->get_device();
1775 if (device.has(sycl::aspect::ext_intel_gpu_eu_simd_width) &&
1776 device.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu))
1778 return device.get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>() *
1779 device.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
1783 return device.get_info<sycl::info::device::max_work_group_size>();
void BLI_kdtree_nd_ free(KDTree *tree)
ATTR_WARN_UNUSED_RESULT const size_t num
for(;discarded_id_iter !=nullptr;discarded_id_iter=static_cast< ID * >(discarded_id_iter->next))
BMesh const char void * data
ATTR_WARN_UNUSED_RESULT const BMVert const BMEdge * e
unsigned long long int uint64_t
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
void refit(btStridingMeshInterface *triangles, const btVector3 &aabbMin, const btVector3 &aabbMax)
SIMD_FORCE_INLINE const btScalar & w() const
Return the w value.
static PointerRNA * get_pointer_type(ButsContextPath *path, StructRNA *type)
bool use_hardware_raytracing
virtual void host_free(const MemoryType type, void *host_pointer, const size_t size)
virtual void build_bvh(BVH *bvh, Progress &progress, bool refit)
virtual void * host_alloc(const MemoryType type, const size_t size)
void mem_alloc(const size_t size)
void mem_free(const size_t size)
size_t memory_elements_size(const int elements)
device_ptr device_pointer
#define KERNEL_DATA_ARRAY(type, name)
#define KERNEL_FEATURE_NODE_RAYTRACE
#define KERNEL_FEATURE_MNEE
#define CCL_NAMESPACE_END
#define assert(assertion)
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_DEDICATED_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE
@ DEVICE_KERNEL_SHADER_EVAL_DISPLACE
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK
@ DEVICE_KERNEL_SHADER_EVAL_BACKGROUND
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_DEDICATED_LIGHT
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA
@ DEVICE_KERNEL_CRYPTOMATTE_POSTPROCESS
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_INIT_FROM_BAKE
@ DEVICE_KERNEL_SHADER_EVAL_CURVE_SHADOW_TRANSPARENCY
@ DEVICE_KERNEL_INTEGRATOR_SHADE_VOLUME
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST
@ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND
#define DCHECK(expression)
string string_human_readable_size(size_t size)
string string_human_readable_number(size_t num)
bool string_startswith(const string_view s, const string_view start)
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
CCL_NAMESPACE_END IntegratorStateGPU * integrator_state
std::unique_lock< std::mutex > thread_scoped_lock
ccl_device_inline size_t round_up(const size_t x, const size_t multiple)
@ IMAGE_DATA_TYPE_NANOVDB_FP16
@ IMAGE_DATA_TYPE_NANOVDB_FLOAT
@ IMAGE_DATA_TYPE_NANOVDB_FLOAT3
@ IMAGE_DATA_TYPE_NANOVDB_FPN