Blender V4.5
queue.mm
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2021-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#ifdef WITH_METAL
6
7# include <algorithm>
8# include <mutex>
9
10# include "device/metal/queue.h"
11
14# include "device/metal/kernel.h"
15
16# include "util/path.h"
17# include "util/string.h"
18# include "util/time.h"
19
21
22/* MetalDeviceQueue */
23
MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
    : DeviceQueue(device), metal_device_(device), stats_(device->stats)
{
  @autoreleasepool {
    /* Cache the Metal device and compute queue owned by the MetalDevice. */
    mtlDevice_ = device->mtlDevice;
    mtlCommandQueue_ = device->mtlComputeCommandQueue;

    /* Request extended per-encoder error reporting on every command buffer. */
    command_buffer_desc_ = [[MTLCommandBufferDescriptor alloc] init];
    command_buffer_desc_.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus;

    /* Shared event used by synchronize() to wait for GPU completion. */
    shared_event_ = [mtlDevice_ newSharedEvent];
    shared_event_id_ = 1;

    /* Shareable event listener. */
    event_queue_ = dispatch_queue_create("com.cycles.metal.event_queue", nullptr);
    shared_event_listener_ = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue_];

    wait_semaphore_ = dispatch_semaphore_create(0);

    const char *profiling_env = getenv("CYCLES_METAL_PROFILING");
    if (profiling_env && atoi(profiling_env) &&
        [mtlDevice_ supportsCounterSampling:MTLCounterSamplingPointAtStageBoundary])
    {
      /* Enable per-kernel timing breakdown (shown at end of render). */
      profiling_enabled_ = true;
      label_command_encoders_ = true;
    }
    if (getenv("CYCLES_METAL_DEBUG")) {
      /* Enable very verbose tracing (shows every dispatch). */
      verbose_tracing_ = true;
      label_command_encoders_ = true;
    }

    setup_capture();
  }
}
60
void MetalDeviceQueue::setup_capture()
{
  /* Configure .gputrace capture from CYCLES_DEBUG_METAL_CAPTURE_* environment
   * variables; returns early when no capturing was requested. */
  capture_kernel_ = DeviceKernel(-1);

  const char *kernel_env = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL");
  const char *samples_env = getenv("CYCLES_DEBUG_METAL_CAPTURE_SAMPLES");

  if (kernel_env) {
    /* CYCLES_DEBUG_METAL_CAPTURE_KERNEL captures a single dispatch of the specified kernel. */
    capture_kernel_ = DeviceKernel(atoi(kernel_env));
    printf("Capture kernel: %d = %s\n", capture_kernel_, device_kernel_as_string(capture_kernel_));

    capture_dispatch_counter_ = 0;
    if (const char *dispatch_env = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
      capture_dispatch_counter_ = atoi(dispatch_env);

      printf("Capture dispatch number %d\n", capture_dispatch_counter_);
    }
  }
  else if (samples_env) {
    /* CYCLES_DEBUG_METAL_CAPTURE_SAMPLES captures a block of dispatches from reset#(N) to
     * reset#(N+1). */
    capture_samples_ = true;
    capture_reset_counter_ = atoi(samples_env);

    capture_dispatch_counter_ = INT_MAX;
    if (const char *limit_env = getenv("CYCLES_DEBUG_METAL_CAPTURE_LIMIT")) {
      /* CYCLES_DEBUG_METAL_CAPTURE_LIMIT sets the maximum number of dispatches to capture. */
      capture_dispatch_counter_ = atoi(limit_env);
    }

    printf("Capturing sample block %d (dispatch limit: %d)\n",
           capture_reset_counter_,
           capture_dispatch_counter_);
  }
  else {
    /* No capturing requested. */
    return;
  }

  /* Enable .gputrace capture for the specified DeviceKernel. */
  MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
  mtlCaptureScope_ = [captureManager newCaptureScopeWithDevice:mtlDevice_];
  mtlCaptureScope_.label = [NSString stringWithFormat:@"Cycles kernel dispatch"];
  [captureManager setDefaultCaptureScope:mtlCaptureScope_];

  label_command_encoders_ = true;

  if (const char *capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) {
    if (![captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
      printf("Capture to file is not supported\n");
      return;
    }

    MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init];
    captureDescriptor.captureObject = mtlCaptureScope_;
    captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
    captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];

    NSError *error;
    if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) {
      NSString *err = [error localizedDescription];
      printf("Start capture failed: %s\n", [err UTF8String]);
    }
    else {
      printf("Capture started (URL: %s)\n", capture_url);
      is_capturing_to_disk_ = true;
    }
  }
}
129
void MetalDeviceQueue::update_capture(DeviceKernel kernel)
{
  /* Drive the capture state machine on every dispatch (see setup_capture). */

  /* Handle capture end triggers. */
  if (is_capturing_) {
    capture_dispatch_counter_ -= 1;
    /* End capture once the dispatch budget is exhausted, or a "reset" arrives. */
    const bool hit_dispatch_limit = (capture_dispatch_counter_ <= 0);
    if (hit_dispatch_limit || kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
      end_capture();
    }
    return;
  }

  if (capture_dispatch_counter_ < 0) {
    /* We finished capturing. */
    return;
  }

  /* Handle single-capture start trigger: begin on the Nth dispatch of the
   * requested kernel. */
  if (kernel == capture_kernel_) {
    if (capture_dispatch_counter_ == 0) {
      begin_capture();
    }
    capture_dispatch_counter_ -= 1;
    return;
  }

  /* Handle multi-capture start trigger: begin once the reset countdown hits 0. */
  if (capture_samples_) {
    if (capture_reset_counter_ == 0) {
      begin_capture();
    }
    if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
      capture_reset_counter_ -= 1;
    }
    return;
  }
}
170
void MetalDeviceQueue::begin_capture()
{
  /* Start gputrace capture; flush in-flight work first so the trace begins at a
   * clean command-buffer boundary. */
  if (mtlCommandBuffer_) {
    synchronize();
  }
  [mtlCaptureScope_ beginScope];
  printf("[mtlCaptureScope_ beginScope]\n");
  is_capturing_ = true;
}
181
void MetalDeviceQueue::end_capture()
{
  /* Close the capture scope; if capturing to disk was started via
   * startCaptureWithDescriptor, also stop the capture manager. */
  [mtlCaptureScope_ endScope];
  is_capturing_ = false;
  printf("[mtlCaptureScope_ endScope]\n");

  if (is_capturing_to_disk_) {
    [[MTLCaptureManager sharedCaptureManager] stopCapture];
    has_captured_to_disk_ = true;
    is_capturing_to_disk_ = false;
    /* NOTE: removed a redundant second `is_capturing_ = false` - it is already
     * cleared unconditionally above. */
    printf("Capture stopped\n");
  }
}
196
MetalDeviceQueue::~MetalDeviceQueue()
{
  /* Tidying up here isn't really practical - we should expect and require the work
   * queue to be empty here. */
  assert(mtlCommandBuffer_ == nil);
  assert(command_buffers_submitted_ == command_buffers_completed_);

  close_compute_encoder();
  close_blit_encoder();

  /* Manual reference counting: balance the allocations from the constructor. */
  [shared_event_listener_ release];
  [shared_event_ release];
  [command_buffer_desc_ release];

  if (mtlCaptureScope_) {
    [mtlCaptureScope_ release];
  }

  double total_time = 0.0;

  /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */
  int64_t num_dispatches = 0;
  int64_t num_pathtracing_dispatches = 0;
  for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
    auto &stat = timing_stats_[i];
    /* NOTE(review): the continuation of this condition was lost in doc
     * extraction; reconstructed to exclude the megakernel - confirm against
     * upstream queue.mm. */
    bool pathtracing_kernel = (i <= DEVICE_KERNEL_INTEGRATOR_RESET) &&
                              (i != DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL);
    total_time += stat.total_time;
    num_dispatches += stat.num_dispatches;
    num_pathtracing_dispatches += pathtracing_kernel ? stat.num_dispatches : 0;
  }
  bool has_extra = (num_pathtracing_dispatches && num_dispatches > num_pathtracing_dispatches);

  if (num_dispatches) {
    printf("\nMetal %sdispatch stats:\n", num_pathtracing_dispatches ? "path-tracing " : "");
    auto header = string_printf("%-40s %16s %12s %12s %9s %9s",
                                "Kernel name",
                                "Total threads",
                                "Dispatches",
                                "Avg. T/D",
                                "Time/s",
                                "Time/%");
    auto divider = string(header.length(), '-');
    printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());

    for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
      auto &stat = timing_stats_[i];

      bool pathtracing_kernel = (i <= DEVICE_KERNEL_INTEGRATOR_RESET) &&
                                (i != DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL);
      if ((pathtracing_kernel && num_pathtracing_dispatches) || stat.num_dispatches > 0) {
        /* Guard the average against zero dispatches: path-tracing kernels that
         * never ran can still be listed, and the original unguarded division
         * would fault. */
        uint64_t avg_threads_per_dispatch = stat.num_dispatches ?
                                                stat.total_work_size / stat.num_dispatches :
                                                0;
        /* NOTE(review): the kernel-name argument was lost in extraction;
         * reconstructed as device_kernel_as_string(DeviceKernel(i)) - confirm. */
        printf("%-40s %16llu %12llu %12llu %9.4f %9.2f\n",
               device_kernel_as_string(DeviceKernel(i)),
               stat.total_work_size,
               stat.num_dispatches,
               avg_threads_per_dispatch,
               stat.total_time,
               stat.total_time * 100.0 / total_time);
      }
      if (has_extra && i == DEVICE_KERNEL_INTEGRATOR_RESET) {
        printf("%s\n", divider.c_str());
      }
    }
    printf("%s\n", divider.c_str());
    printf("%-40s %16s %12llu %12s %9.4f %9.2f\n", "", "", num_dispatches, "", total_time, 100.0);
    printf("%s\n\n", divider.c_str());
  }
}
265
int MetalDeviceQueue::num_concurrent_states(const size_t state_size) const
{
  /* Computed once and cached for the lifetime of the process. */
  static int result = 0;
  if (result) {
    return result;
  }

  result = 4194304;

  /* Increasing the state count doesn't notably benefit M1-family systems. */
  if (MetalInfo::get_apple_gpu_architecture(metal_device_->mtlDevice) != APPLE_M1) {
    size_t system_ram = system_physical_ram();
    size_t allocated_so_far = [metal_device_->mtlDevice currentAllocatedSize];
    size_t max_recommended_working_set = [metal_device_->mtlDevice recommendedMaxWorkingSetSize];

    /* Determine whether we can double the state count, and leave enough GPU-available memory
     * (1/8 the system RAM or 1GB - whichever is largest). Enlarging the state size allows us to
     * keep dispatch sizes high and minimize work submission overheads. */
    size_t min_headroom = std::max(system_ram / 8, size_t(1024 * 1024 * 1024));
    size_t total_state_size = result * state_size;

    /* BUGFIX: the original form "working_set - allocated - states * 2 >= headroom"
     * underflows (size_t wraps) when memory is already tight, wrongly passing the
     * check and doubling the state count. Compare via addition instead. */
    if (max_recommended_working_set >= allocated_so_far + total_state_size * 2 + min_headroom) {
      result *= 2;
      metal_printf("Doubling state count to exploit available RAM (new size = %d)\n", result);
    }
  }
  return result;
}
293
int MetalDeviceQueue::num_concurrent_busy_states(const size_t state_size) const
{
  /* A 1:4 busy:total ratio gives best rendering performance, independent of total state count. */
  const int total_states = num_concurrent_states(state_size);
  return total_states / 4;
}
299
int MetalDeviceQueue::num_sort_partitions(int max_num_paths, uint max_scene_shaders) const
{
  const int sort_partition_elements = MetalInfo::optimal_sort_partition_elements();

  /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu of
   * a more sophisticated heuristic we simply disable sort partitioning if the shader count is
   * high. */
  const bool partitioning_useful = (max_scene_shaders < 300) && (sort_partition_elements > 0);
  if (!partitioning_useful) {
    return 1;
  }
  return max(max_num_paths / sort_partition_elements, 1);
}
314
bool MetalDeviceQueue::supports_local_atomic_sort() const
{
  /* Delegate to the device; availability depends on the device configuration. */
  return metal_device_->use_local_atomic_sort();
}
319
void MetalDeviceQueue::init_execution()
{
  /* Make texture info and any pending memory copies visible to the GPU before
   * the first kernel of a task runs. */
  metal_device_->load_texture_info();
  synchronize();
}
327
bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
                               const int work_size,
                               const DeviceKernelArguments &args)
{
  @autoreleasepool {
    /* Dispatch `kernel` with `work_size` threads. Arguments, KernelParamsMetal
     * globals and ancillary (bindless / MetalRT) bindings are packed into one
     * temporary argument buffer. Returns false when the device is in error. */
    update_capture(kernel);

    if (metal_device_->have_error()) {
      return false;
    }

    VLOG_DEVICE_STATS << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
                      << work_size;

    id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);

    if (profiling_enabled_) {
      command_encoder_labels_.push_back({kernel, work_size, current_encoder_idx_});
    }

    /* Determine size requirement for argument buffer. */
    size_t arg_buffer_length = 0;
    for (size_t i = 0; i < args.count; i++) {
      size_t size_in_bytes = args.sizes[i];
      arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
    }
    /* 256 is the Metal offset alignment for constant address space bindings. */
    arg_buffer_length = round_up(arg_buffer_length, 256);

    /* Globals placed after "vanilla" arguments. */
    size_t globals_offsets = arg_buffer_length;
    arg_buffer_length += sizeof(KernelParamsMetal);
    arg_buffer_length = round_up(arg_buffer_length, 256);

    /* Metal ancillary bindless pointers. */
    size_t metal_offsets = arg_buffer_length;
    arg_buffer_length += metal_device_->mtlAncillaryArgEncoder.encodedLength;
    arg_buffer_length = round_up(arg_buffer_length,
                                 metal_device_->mtlAncillaryArgEncoder.alignment);

    /* Temporary buffer used to prepare arg_buffer. */
    uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length);
    memset(init_arg_buffer, 0, arg_buffer_length);

    /* Prepare the non-pointer "enqueue" arguments. */
    size_t bytes_written = 0;
    for (size_t i = 0; i < args.count; i++) {
      size_t size_in_bytes = args.sizes[i];
      bytes_written = round_up(bytes_written, size_in_bytes);
      /* NOTE(review): this guard was lost in doc extraction; reconstructed -
       * pointer arguments are encoded separately below. Confirm upstream. */
      if (args.types[i] != DeviceKernelArguments::POINTER) {
        memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
      }
      bytes_written += size_in_bytes;
    }

    /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data. */
    /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal. */
    size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
                                          offsetof(IntegratorStateGPU, sort_partition_divisor);
    size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
    memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
           (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
           plain_old_launch_data_size);

    /* Allocate an argument buffer. */
    id<MTLBuffer> arg_buffer = temp_buffer_pool_.get_buffer(
        mtlDevice_, mtlCommandBuffer_, arg_buffer_length, init_arg_buffer, stats_);

    /* Encode the pointer "enqueue" arguments. */
    bytes_written = 0;
    for (size_t i = 0; i < args.count; i++) {
      size_t size_in_bytes = args.sizes[i];
      bytes_written = round_up(bytes_written, size_in_bytes);
      /* NOTE(review): guard reconstructed (lost in doc extraction) - confirm. */
      if (args.types[i] == DeviceKernelArguments::POINTER) {
        [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
                                                                offset:bytes_written];
        if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
          [mtlComputeCommandEncoder useResource:mmem->mtlBuffer
                                          usage:MTLResourceUsageRead | MTLResourceUsageWrite];
          [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
                                                          offset:0
                                                         atIndex:0];
        }
        else {
          if (@available(macos 12.0, *)) {
            [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
          }
        }
      }
      bytes_written += size_in_bytes;
    }

    /* Encode KernelParamsMetal buffers. */
    [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
                                                            offset:globals_offsets];

    if (label_command_encoders_) {
      /* Add human-readable labels if we're doing any form of debugging / profiling. */
      /* NOTE(review): the kernel-name argument was lost in extraction;
       * reconstructed as device_kernel_as_string(kernel) - confirm. */
      mtlComputeCommandEncoder.label = [NSString
          stringWithFormat:@"Metal queue launch %s, work_size %d",
                           device_kernel_as_string(kernel),
                           work_size];
    }

    /* This relies on IntegratorStateGPU layout being contiguous device_ptrs. */
    const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
                                     offsetof(IntegratorStateGPU, sort_partition_divisor);
    for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
      int pointer_index = int(offset / sizeof(device_ptr));
      MetalDevice::MetalMem *mmem = *(
          MetalDevice::MetalMem **)((uint8_t *)&metal_device_->launch_params + offset);
      if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
        [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
                                                        offset:0
                                                       atIndex:pointer_index];
      }
      else {
        if (@available(macos 12.0, *)) {
          [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil
                                                          offset:0
                                                         atIndex:pointer_index];
        }
      }
    }
    bytes_written = globals_offsets + sizeof(KernelParamsMetal);

    if (!active_pipelines_[kernel].update(metal_device_, kernel)) {
      metal_device_->set_error(
          string_printf("Could not activate pipeline for %s\n", device_kernel_as_string(kernel)));
      return false;
    }
    MetalDispatchPipeline &active_pipeline = active_pipelines_[kernel];

    /* Encode ancillaries. */
    [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
    [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_2d
                                              offset:0
                                             atIndex:0];
    [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d
                                              offset:0
                                             atIndex:1];
    [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->buffer_bindings_1d
                                              offset:0
                                             atIndex:2];

    if (@available(macos 12.0, *)) {
      if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
        if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
          [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:3];
          [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->blas_buffer
                                                    offset:0
                                                   atIndex:(METALRT_TABLE_NUM + 4)];
        }

        for (int table = 0; table < METALRT_TABLE_NUM; table++) {
          if (active_pipeline.intersection_func_table[table]) {
            [active_pipeline.intersection_func_table[table] setBuffer:arg_buffer
                                                               offset:globals_offsets
                                                              atIndex:1];
            [metal_device_->mtlAncillaryArgEncoder
                setIntersectionFunctionTable:active_pipeline.intersection_func_table[table]
                                     atIndex:4 + table];
            [mtlComputeCommandEncoder useResource:active_pipeline.intersection_func_table[table]
                                            usage:MTLResourceUsageRead];
          }
          else {
            [metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
                                                                        atIndex:4 + table];
          }
        }
      }
      bytes_written = metal_offsets + metal_device_->mtlAncillaryArgEncoder.encodedLength;
    }

    [mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
    [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
    [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];

    if (metal_device_->use_metalrt && device_kernel_has_intersection(kernel)) {
      if (@available(macos 12.0, *)) {

        if (id<MTLAccelerationStructure> accel_struct = metal_device_->accel_struct) {
          /* Mark all Accelerations resources as used */
          [mtlComputeCommandEncoder useResource:accel_struct usage:MTLResourceUsageRead];
          if (metal_device_->blas_buffer) {
            [mtlComputeCommandEncoder useResource:metal_device_->blas_buffer
                                            usage:MTLResourceUsageRead];
          }
          [mtlComputeCommandEncoder useResources:metal_device_->unique_blas_array.data()
                                           count:metal_device_->unique_blas_array.size()
                                           usage:MTLResourceUsageRead];
        }
      }
    }

    [mtlComputeCommandEncoder setComputePipelineState:active_pipeline.pipeline];

    /* Compute kernel launch parameters. */
    const int num_threads_per_block = active_pipeline.num_threads_per_block;

    int shared_mem_bytes = 0;

    switch (kernel) {
      /* NOTE(review): these case labels were dropped by the doc extraction;
       * reconstructed as the parallel-active-index kernel set - confirm. */
      case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
      case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
        /* See parallel_active_index.h for why this amount of shared memory is needed.
         * Rounded up to 16 bytes for Metal */
        shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
        break;

      case DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS:
      case DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS: {
        int key_count = metal_device_->launch_params.data.max_shaders;
        shared_mem_bytes = (int)round_up(key_count * sizeof(int), 16);
        break;
      }

      default:
        break;
    }

    if (shared_mem_bytes) {
      assert(shared_mem_bytes <= 32 * 1024);
      [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
    }

    MTLSize size_threads_per_dispatch = MTLSizeMake(work_size, 1, 1);
    MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
    [mtlComputeCommandEncoder dispatchThreads:size_threads_per_dispatch
                        threadsPerThreadgroup:size_threads_per_threadgroup];

    [mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
      /* Enhanced command buffer errors. */
      string str;
      if (command_buffer.status != MTLCommandBufferStatusCompleted) {
        str = string_printf("Command buffer not completed. status = %d. ",
                            int(command_buffer.status));
      }
      if (command_buffer.error) {
        @autoreleasepool {
          const char *errCStr = [[NSString stringWithFormat:@"%@", command_buffer.error]
              UTF8String];
          /* NOTE(review): middle argument lost in extraction; reconstructed as
           * the kernel name to match the "(type.kernel)" format - confirm. */
          str += string_printf("(%s.%s):\n%s\n",
                               kernel_type_as_string(active_pipeline.pso_type),
                               device_kernel_as_string(kernel),
                               errCStr);
        }
      }
      if (!str.empty()) {
        metal_device_->set_error(str);
      }
    }];

    if (verbose_tracing_ || is_capturing_) {
      /* Force a sync if we've enabled step-by-step verbose tracing or if we're capturing. */
      synchronize();

      /* Show queue counters and dispatch timing. */
      if (verbose_tracing_) {
        if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
          printf(
              "_____________________________________.____________________.______________._________"
              "__"
              "______________________________________\n");
        }

        printf("%-40s| %7d threads |%5.2fms | buckets [",
               device_kernel_as_string(kernel),
               work_size,
               last_completion_time_ * 1000.0);
        std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
        for (auto &it : metal_device_->metal_mem_map) {
          const string c_integrator_queue_counter = "integrator_queue_counter";
          if (it.first->name == c_integrator_queue_counter) {
            if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *)
                    it.first->host_pointer)
            {
              for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++) {
                printf("%s%d", i == 0 ? "" : ",", queue_counter->num_queued[i]);
              }
            }
            break;
          }
        }
        printf("]\n");
      }
    }

    return !(metal_device_->have_error());
  }
}
625
void MetalDeviceQueue::flush_timing_stats()
{
  /* Resolve the GPU timestamp pair recorded for each labelled command encoder
   * (see CYCLES_METAL_PROFILING) and fold it into the per-kernel stats. */
  for (auto label : command_encoder_labels_) {
    NSData *sample_data = [metal_device_->mtlCounterSampleBuffer
        resolveCounterRange:NSMakeRange(label.timing_id, 2)];
    MTLCounterResultTimestamp *resolved = (MTLCounterResultTimestamp *)(sample_data.bytes);

    /* Encoder duration = end-of-encoder minus start-of-encoder timestamp. */
    const uint64_t time_begin = resolved[0].timestamp;
    const uint64_t time_end = resolved[1].timestamp;
    const double gpu_seconds = (time_end - time_begin) / (double)NSEC_PER_SEC;

    TimingStats &stat = timing_stats_[label.kernel];
    stat.num_dispatches++;
    stat.total_time += gpu_seconds;
    stat.total_work_size += label.work_size;
    last_completion_time_ = gpu_seconds;
  }
  command_encoder_labels_.clear();
}
647
bool MetalDeviceQueue::synchronize()
{
  @autoreleasepool {
    /* Submit any open encoders and block until all committed GPU work has
     * completed. Returns false if the device errored or a disk capture ended. */
    if (has_captured_to_disk_ || metal_device_->have_error()) {
      return false;
    }

    close_compute_encoder();
    close_blit_encoder();

    if (mtlCommandBuffer_) {
      /* Signal a monotonically increasing shared-event value and wait for the
       * GPU to reach it. NOTE: renamed the local (was `shared_event_id_`) so it
       * no longer shadows the member of the same name. */
      uint64_t signal_value = this->shared_event_id_++;

      __block dispatch_semaphore_t block_sema = wait_semaphore_;
      [shared_event_ notifyListener:shared_event_listener_
                            atValue:signal_value
                              block:^(id<MTLSharedEvent> /*sharedEvent*/, uint64_t /*value*/) {
                                dispatch_semaphore_signal(block_sema);
                              }];

      [mtlCommandBuffer_ encodeSignalEvent:shared_event_ value:signal_value];
      [mtlCommandBuffer_ commit];
      dispatch_semaphore_wait(wait_semaphore_, DISPATCH_TIME_FOREVER);

      /* BUGFIX: recycle temp buffers keyed on this command buffer BEFORE
       * releasing our reference; the original released first and then passed
       * the (potentially deallocated) pointer to the pool. */
      temp_buffer_pool_.process_command_buffer_completion(mtlCommandBuffer_);
      metal_device_->flush_delayed_free_list();

      [mtlCommandBuffer_ release];
      mtlCommandBuffer_ = nil;
      flush_timing_stats();
    }

    return !(metal_device_->have_error());
  }
}
686
void MetalDeviceQueue::zero_to_device(device_memory &mem)
{
  @autoreleasepool {
    if (metal_device_->have_error()) {
      return;
    }

    assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);

    if (mem.memory_size() == 0) {
      return;
    }

    /* Lazily allocate device storage on first use. */
    if (mem.device_pointer == 0) {
      metal_device_->mem_alloc(mem);
    }

    /* Zero memory on device. */
    assert(mem.device_pointer != 0);

    /* Prefer a GPU-side blit fill when the allocation is backed by an
     * MTLBuffer; otherwise fall back to the host-side zeroing path. */
    std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
    MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem);
    if (!mmem.mtlBuffer) {
      metal_device_->mem_zero(mem);
      return;
    }
    id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
    [blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0];
  }
}
719
void MetalDeviceQueue::copy_to_device(device_memory &mem)
{
  @autoreleasepool {
    /* Nothing to do for an errored device or an empty allocation. */
    if (metal_device_->have_error() || mem.memory_size() == 0) {
      return;
    }

    /* Allocate on demand. */
    if (mem.device_pointer == 0) {
      metal_device_->mem_alloc(mem);
    }

    assert(mem.device_pointer != 0);
    assert(mem.host_pointer != nullptr);
    /* No need to copy - Apple Silicon has Unified Memory Architecture. */
  }
}
741
void MetalDeviceQueue::copy_from_device(device_memory & /*mem*/)
{
  /* Intentionally empty: Apple Silicon's Unified Memory Architecture means the
   * host already sees device memory, so there is nothing to copy back. */
}
746
void MetalDeviceQueue::prepare_resources(DeviceKernel /*kernel*/)
{
  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);

  /* Declare resource usage for every live allocation so Metal keeps it resident. */
  for (auto &entry : metal_device_->metal_mem_map) {
    device_memory *mem = entry.first;

    /* Read-only memory types never need write access. */
    MTLResourceUsage usage = MTLResourceUsageRead;
    if (mem->type != MEM_GLOBAL && mem->type != MEM_READ_ONLY && mem->type != MEM_TEXTURE) {
      usage |= MTLResourceUsageWrite;
    }

    if (entry.second->mtlBuffer) {
      /* METAL_WIP - use array version (i.e. useResources) */
      [mtlComputeEncoder_ useResource:entry.second->mtlBuffer usage:usage];
    }
    else if (entry.second->mtlTexture) {
      /* METAL_WIP - use array version (i.e. useResources) */
      [mtlComputeEncoder_ useResource:entry.second->mtlTexture
                                usage:usage | MTLResourceUsageSample];
    }
  }

  /* Ancillary binding tables are always read-only. */
  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead];
  [mtlComputeEncoder_ useResource:metal_device_->buffer_bindings_1d usage:MTLResourceUsageRead];
}
775
id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
{
  /* Integrator kernels may overlap, so they get a concurrent-dispatch encoder. */
  bool concurrent = int(kernel) < int(DEVICE_KERNEL_INTEGRATOR_NUM);

  if (profiling_enabled_) {
    /* Close the current encoder to ensure we're able to capture per-encoder timing data. */
    close_compute_encoder();
  }

  if (mtlComputeEncoder_) {
    /* Reuse the open encoder only when its dispatch type matches the request.
     * BUGFIX: added explicit parentheses - the original spelling parsed as
     * "(dispatchType == concurrent) ? Concurrent : Serial" because `==` binds
     * tighter than `?:`, and only worked because MTLDispatchTypeSerial happens
     * to be 0 and MTLDispatchTypeConcurrent happens to be 1. */
    if (mtlComputeEncoder_.dispatchType ==
        (concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial))
    {
      /* declare usage of MTLBuffers etc */
      prepare_resources(kernel);

      return mtlComputeEncoder_;
    }
    close_compute_encoder();
  }

  close_blit_encoder();

  if (!mtlCommandBuffer_) {
    mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
    [mtlCommandBuffer_ retain];
  }

  if (profiling_enabled_) {
    /* Attach a counter-sample pair so per-encoder GPU time can be resolved later
     * (see flush_timing_stats). */
    MTLComputePassDescriptor *desc = [[MTLComputePassDescriptor alloc] init];

    current_encoder_idx_ = (counter_sample_buffer_curr_idx_.fetch_add(2) %
                            MAX_SAMPLE_BUFFER_LENGTH);
    [desc.sampleBufferAttachments[0] setSampleBuffer:metal_device_->mtlCounterSampleBuffer];
    [desc.sampleBufferAttachments[0] setStartOfEncoderSampleIndex:current_encoder_idx_];
    [desc.sampleBufferAttachments[0] setEndOfEncoderSampleIndex:current_encoder_idx_ + 1];

    [desc setDispatchType:concurrent ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial];

    mtlComputeEncoder_ = [mtlCommandBuffer_ computeCommandEncoderWithDescriptor:desc];

    /* BUGFIX: balance the alloc/init above (this file uses manual reference
     * counting) - the descriptor previously leaked on every encoder creation. */
    [desc release];
  }
  else {
    mtlComputeEncoder_ = [mtlCommandBuffer_
        computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
                                                           MTLDispatchTypeSerial];
  }

  [mtlComputeEncoder_ retain];
  [mtlComputeEncoder_ setLabel:@(device_kernel_as_string(kernel))];

  /* declare usage of MTLBuffers etc */
  prepare_resources(kernel);

  return mtlComputeEncoder_;
}
831
id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
{
  /* Reuse the already-open blit encoder when possible. */
  if (mtlBlitEncoder_ == nil) {
    /* A blit encoder cannot coexist with an open compute encoder. */
    close_compute_encoder();

    /* Lazily create the command buffer that will carry this encoder. */
    if (mtlCommandBuffer_ == nil) {
      mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
      [mtlCommandBuffer_ retain];
    }

    mtlBlitEncoder_ = [mtlCommandBuffer_ blitCommandEncoder];
    [mtlBlitEncoder_ retain];
  }
  return mtlBlitEncoder_;
}
849
void MetalDeviceQueue::close_compute_encoder()
{
  /* End, release and clear the open compute encoder, if any. */
  if (mtlComputeEncoder_ == nil) {
    return;
  }
  [mtlComputeEncoder_ endEncoding];
  [mtlComputeEncoder_ release];
  mtlComputeEncoder_ = nil;
}
858
void MetalDeviceQueue::close_blit_encoder()
{
  /* End, release and clear the open blit encoder, if any. */
  if (mtlBlitEncoder_ == nil) {
    return;
  }
  [mtlBlitEncoder_ endEncoding];
  [mtlBlitEncoder_ release];
  mtlBlitEncoder_ = nil;
}
867
void *MetalDeviceQueue::native_queue()
{
  /* Expose the raw MTLCommandQueue for interop with external code. */
  return mtlCommandQueue_;
}
872
unique_ptr<DeviceGraphicsInterop> MetalDeviceQueue::graphics_interop_create()
{
  /* Factory for the Metal flavour of graphics interop, bound to this queue. */
  return make_unique<MetalDeviceGraphicsInterop>(this);
}
877
879
880#endif /* WITH_METAL */
unsigned int uint
volatile int lock
long long int int64_t
unsigned long long int uint64_t
@ MEM_TEXTURE
@ MEM_READ_ONLY
#define CCL_NAMESPACE_END
bool device_kernel_has_intersection(DeviceKernel kernel)
const char * device_kernel_as_string(DeviceKernel kernel)
#define offsetof(t, d)
#define str(s)
#define assert(assertion)
#define printf(...)
int count
ccl_gpu_kernel_postfix const ccl_global int ccl_global float const int work_size
@ DEVICE_KERNEL_INTEGRATOR_NUM
DeviceKernel
@ DEVICE_KERNEL_INTEGRATOR_RESET
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SORT_WRITE_PASS
@ DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_SORT_BUCKET_PASS
@ DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL
@ DEVICE_KERNEL_NUM
@ DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY
@ DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY
#define VLOG_DEVICE_STATS
Definition log.h:77
static void error(const char *str)
static void update(bNodeTree *ntree)
static void init(bNodeTree *, bNode *node)
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
Definition string.cpp:23
void * values[MAX_ARGS]
size_t sizes[MAX_ARGS]
Type types[MAX_ARGS]
size_t system_physical_ram()
Definition system.cpp:227
i
Definition text_draw.cc:230
max
Definition text_draw.cc:251
ccl_device_inline size_t round_up(const size_t x, const size_t multiple)
Definition types_base.h:57
uint64_t device_ptr
Definition types_base.h:44
wmTimer * timer
double total_time